mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2025-01-06 19:49:41 +03:00
uint -> size_t
This commit is contained in:
parent
9ec1bef6fb
commit
9861ecbbe5
@ -38,22 +38,21 @@
|
|||||||
|
|
||||||
typedef struct _cmd CMD;
|
typedef struct _cmd CMD;
|
||||||
|
|
||||||
struct _cmd
|
struct _cmd {
|
||||||
{
|
CMD * next;
|
||||||
CMD * next;
|
CMD * tail; /* valid on in head */
|
||||||
CMD * tail; /* valid on in head */
|
RULE * rule; /* rule->actions contains shell script */
|
||||||
RULE * rule; /* rule->actions contains shell script */
|
LIST * shell; /* $(SHELL) value */
|
||||||
LIST * shell; /* $(SHELL) value */
|
LOL args; /* LISTs for $(<), $(>) */
|
||||||
LOL args; /* LISTs for $(<), $(>) */
|
char * buf; /* actual commands */
|
||||||
char * buf; /* actual commands */
|
|
||||||
};
|
};
|
||||||
|
|
||||||
CMD * cmd_new
|
CMD * cmd_new
|
||||||
(
|
(
|
||||||
RULE * rule, /* rule (referenced) */
|
RULE * rule, /* rule (referenced) */
|
||||||
LIST * targets, /* $(<) (freed) */
|
LIST * targets, /* $(<) (freed) */
|
||||||
LIST * sources, /* $(>) (freed) */
|
LIST * sources, /* $(>) (freed) */
|
||||||
LIST * shell /* $(SHELL) (freed) */
|
LIST * shell /* $(SHELL) (freed) */
|
||||||
);
|
);
|
||||||
|
|
||||||
void cmd_free( CMD * );
|
void cmd_free( CMD * );
|
||||||
|
@ -10,35 +10,33 @@
|
|||||||
#include <time.h>
|
#include <time.h>
|
||||||
|
|
||||||
|
|
||||||
struct profile_info
|
struct profile_info {
|
||||||
{
|
/* name of rule being called */
|
||||||
/* name of rule being called */
|
char* name;
|
||||||
char* name;
|
/* cumulative time spent in rule */
|
||||||
/* cumulative time spent in rule */
|
clock_t cumulative;
|
||||||
clock_t cumulative;
|
/* time spent in rule proper */
|
||||||
/* time spent in rule proper */
|
clock_t net;
|
||||||
clock_t net;
|
/* number of time rule was entered */
|
||||||
/* number of time rule was entered */
|
unsigned long num_entries;
|
||||||
unsigned long num_entries;
|
/* number of the times this function is present in stack */
|
||||||
/* number of the times this function is present in stack */
|
unsigned long stack_count;
|
||||||
unsigned long stack_count;
|
/* bytes of memory allocated by the call */
|
||||||
/* bytes of memory allocated by the call */
|
unsigned long memory;
|
||||||
unsigned long memory;
|
|
||||||
};
|
};
|
||||||
typedef struct profile_info profile_info;
|
typedef struct profile_info profile_info;
|
||||||
|
|
||||||
struct profile_frame
|
struct profile_frame {
|
||||||
{
|
/* permanent storage where data accumulates */
|
||||||
/* permanent storage where data accumulates */
|
profile_info* info;
|
||||||
profile_info* info;
|
/* overhead for profiling in this call */
|
||||||
/* overhead for profiling in this call */
|
clock_t overhead;
|
||||||
clock_t overhead;
|
/* time of last entry to rule */
|
||||||
/* time of last entry to rule */
|
clock_t entry_time;
|
||||||
clock_t entry_time;
|
/* stack frame of caller */
|
||||||
/* stack frame of caller */
|
struct profile_frame* caller;
|
||||||
struct profile_frame* caller;
|
/* time spent in subrules */
|
||||||
/* time spent in subrules */
|
clock_t subrules;
|
||||||
clock_t subrules;
|
|
||||||
};
|
};
|
||||||
typedef struct profile_frame profile_frame;
|
typedef struct profile_frame profile_frame;
|
||||||
|
|
||||||
|
@ -18,22 +18,21 @@
|
|||||||
|
|
||||||
#include <time.h>
|
#include <time.h>
|
||||||
|
|
||||||
typedef struct timing_info
|
typedef struct timing_info {
|
||||||
{
|
double system;
|
||||||
double system;
|
double user;
|
||||||
double user;
|
time_t start;
|
||||||
time_t start;
|
time_t end;
|
||||||
time_t end;
|
|
||||||
} timing_info;
|
} timing_info;
|
||||||
|
|
||||||
void exec_cmd
|
void exec_cmd
|
||||||
(
|
(
|
||||||
char * string,
|
char * string,
|
||||||
void (* func)( void * closure, int status, timing_info *, char *, char * ),
|
void (* func)( void * closure, int status, timing_info *, char *, char * ),
|
||||||
void * closure,
|
void * closure,
|
||||||
LIST * shell,
|
LIST * shell,
|
||||||
char * action,
|
char * action,
|
||||||
char * target
|
char * target
|
||||||
);
|
);
|
||||||
|
|
||||||
int exec_wait();
|
int exec_wait();
|
||||||
|
@ -33,14 +33,13 @@ int file_is_file(char* filename);
|
|||||||
int file_mkdir(char *pathname);
|
int file_mkdir(char *pathname);
|
||||||
|
|
||||||
typedef struct file_info_t file_info_t ;
|
typedef struct file_info_t file_info_t ;
|
||||||
struct file_info_t
|
struct file_info_t {
|
||||||
{
|
char * name;
|
||||||
char * name;
|
short is_file;
|
||||||
short is_file;
|
short is_dir;
|
||||||
short is_dir;
|
unsigned long size;
|
||||||
unsigned long size;
|
time_t time;
|
||||||
time_t time;
|
LIST * files;
|
||||||
LIST * files;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
@ -12,15 +12,14 @@
|
|||||||
typedef struct _PARSE PARSE;
|
typedef struct _PARSE PARSE;
|
||||||
typedef struct frame FRAME;
|
typedef struct frame FRAME;
|
||||||
|
|
||||||
struct frame
|
struct frame {
|
||||||
{
|
FRAME * prev;
|
||||||
FRAME * prev;
|
/* The nearest enclosing frame for which module->user_module is true. */
|
||||||
/* The nearest enclosing frame for which module->user_module is true. */
|
FRAME * prev_user;
|
||||||
FRAME * prev_user;
|
LOL args[ 1 ];
|
||||||
LOL args[ 1 ];
|
module_t * module;
|
||||||
module_t * module;
|
PARSE * procedure;
|
||||||
PARSE * procedure;
|
char * rulename;
|
||||||
char * rulename;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
@ -91,7 +91,7 @@
|
|||||||
#include <ctype.h>
|
#include <ctype.h>
|
||||||
#include <malloc.h>
|
#include <malloc.h>
|
||||||
#ifndef __MWERKS__
|
#ifndef __MWERKS__
|
||||||
#include <memory.h>
|
#include <memory.h>
|
||||||
#endif
|
#endif
|
||||||
#include <signal.h>
|
#include <signal.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
@ -113,17 +113,17 @@
|
|||||||
/* AS400 cross-compile from NT. */
|
/* AS400 cross-compile from NT. */
|
||||||
|
|
||||||
#ifdef AS400
|
#ifdef AS400
|
||||||
#undef OSMINOR
|
#undef OSMINOR
|
||||||
#undef OSMAJOR
|
#undef OSMAJOR
|
||||||
#define OSMAJOR "AS400=true"
|
#define OSMAJOR "AS400=true"
|
||||||
#define OSMINOR "OS=AS400"
|
#define OSMINOR "OS=AS400"
|
||||||
#define OS_AS400
|
#define OS_AS400
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* Metrowerks Standard Library on Windows. */
|
/* Metrowerks Standard Library on Windows. */
|
||||||
|
|
||||||
#ifdef __MSL__
|
#ifdef __MSL__
|
||||||
#undef HAVE_POPEN
|
#undef HAVE_POPEN
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
# endif
|
# endif
|
||||||
@ -182,7 +182,7 @@
|
|||||||
#define DOWNSHIFT_PATHS
|
#define DOWNSHIFT_PATHS
|
||||||
|
|
||||||
#ifdef __EMX__
|
#ifdef __EMX__
|
||||||
#define USE_FILEUNIX
|
#define USE_FILEUNIX
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
@ -218,181 +218,181 @@
|
|||||||
#define PATH_DELIM '/'
|
#define PATH_DELIM '/'
|
||||||
|
|
||||||
#ifdef _AIX
|
#ifdef _AIX
|
||||||
#define unix
|
#define unix
|
||||||
#define MAXLINE 23552 /* 24k - 1k, longest 'together' actions */
|
#define MAXLINE 23552 /* 24k - 1k, longest 'together' actions */
|
||||||
#define OSMINOR "OS=AIX"
|
#define OSMINOR "OS=AIX"
|
||||||
#define OS_AIX
|
#define OS_AIX
|
||||||
#define NO_VFORK
|
#define NO_VFORK
|
||||||
#endif
|
#endif
|
||||||
#ifdef AMIGA
|
#ifdef AMIGA
|
||||||
#define OSMINOR "OS=AMIGA"
|
#define OSMINOR "OS=AMIGA"
|
||||||
#define OS_AMIGA
|
#define OS_AMIGA
|
||||||
#endif
|
#endif
|
||||||
#ifdef __BEOS__
|
#ifdef __BEOS__
|
||||||
#define unix
|
#define unix
|
||||||
#define OSMINOR "OS=BEOS"
|
#define OSMINOR "OS=BEOS"
|
||||||
#define OS_BEOS
|
#define OS_BEOS
|
||||||
#define NO_VFORK
|
#define NO_VFORK
|
||||||
#endif
|
#endif
|
||||||
#ifdef __bsdi__
|
#ifdef __bsdi__
|
||||||
#define OSMINOR "OS=BSDI"
|
#define OSMINOR "OS=BSDI"
|
||||||
#define OS_BSDI
|
#define OS_BSDI
|
||||||
#endif
|
#endif
|
||||||
#if defined (COHERENT) && defined (_I386)
|
#if defined (COHERENT) && defined (_I386)
|
||||||
#define OSMINOR "OS=COHERENT"
|
#define OSMINOR "OS=COHERENT"
|
||||||
#define OS_COHERENT
|
#define OS_COHERENT
|
||||||
#define NO_VFORK
|
#define NO_VFORK
|
||||||
#endif
|
#endif
|
||||||
#if defined(__cygwin__) || defined(__CYGWIN__)
|
#if defined(__cygwin__) || defined(__CYGWIN__)
|
||||||
#define OSMINOR "OS=CYGWIN"
|
#define OSMINOR "OS=CYGWIN"
|
||||||
#define OS_CYGWIN
|
#define OS_CYGWIN
|
||||||
#endif
|
#endif
|
||||||
#if defined(__FreeBSD__) && !defined(__DragonFly__)
|
#if defined(__FreeBSD__) && !defined(__DragonFly__)
|
||||||
#define OSMINOR "OS=FREEBSD"
|
#define OSMINOR "OS=FREEBSD"
|
||||||
#define OS_FREEBSD
|
#define OS_FREEBSD
|
||||||
#endif
|
#endif
|
||||||
#ifdef __DragonFly__
|
#ifdef __DragonFly__
|
||||||
#define OSMINOR "OS=DRAGONFLYBSD"
|
#define OSMINOR "OS=DRAGONFLYBSD"
|
||||||
#define OS_DRAGONFLYBSD
|
#define OS_DRAGONFLYBSD
|
||||||
#endif
|
#endif
|
||||||
#ifdef __DGUX__
|
#ifdef __DGUX__
|
||||||
#define OSMINOR "OS=DGUX"
|
#define OSMINOR "OS=DGUX"
|
||||||
#define OS_DGUX
|
#define OS_DGUX
|
||||||
#endif
|
#endif
|
||||||
#ifdef __hpux
|
#ifdef __hpux
|
||||||
#define OSMINOR "OS=HPUX"
|
#define OSMINOR "OS=HPUX"
|
||||||
#define OS_HPUX
|
#define OS_HPUX
|
||||||
#endif
|
#endif
|
||||||
#ifdef __OPENNT
|
#ifdef __OPENNT
|
||||||
#define unix
|
#define unix
|
||||||
#define OSMINOR "OS=INTERIX"
|
#define OSMINOR "OS=INTERIX"
|
||||||
#define OS_INTERIX
|
#define OS_INTERIX
|
||||||
#define NO_VFORK
|
#define NO_VFORK
|
||||||
#endif
|
#endif
|
||||||
#ifdef __sgi
|
#ifdef __sgi
|
||||||
#define OSMINOR "OS=IRIX"
|
#define OSMINOR "OS=IRIX"
|
||||||
#define OS_IRIX
|
#define OS_IRIX
|
||||||
#define NO_VFORK
|
#define NO_VFORK
|
||||||
#endif
|
#endif
|
||||||
#ifdef __ISC
|
#ifdef __ISC
|
||||||
#define OSMINOR "OS=ISC"
|
#define OSMINOR "OS=ISC"
|
||||||
#define OS_ISC
|
#define OS_ISC
|
||||||
#define NO_VFORK
|
#define NO_VFORK
|
||||||
#endif
|
#endif
|
||||||
#ifdef linux
|
#ifdef linux
|
||||||
#define OSMINOR "OS=LINUX"
|
#define OSMINOR "OS=LINUX"
|
||||||
#define OS_LINUX
|
#define OS_LINUX
|
||||||
#endif
|
#endif
|
||||||
#ifdef __Lynx__
|
#ifdef __Lynx__
|
||||||
#define OSMINOR "OS=LYNX"
|
#define OSMINOR "OS=LYNX"
|
||||||
#define OS_LYNX
|
#define OS_LYNX
|
||||||
#define NO_VFORK
|
#define NO_VFORK
|
||||||
#define unix
|
#define unix
|
||||||
#endif
|
#endif
|
||||||
#ifdef __MACHTEN__
|
#ifdef __MACHTEN__
|
||||||
#define OSMINOR "OS=MACHTEN"
|
#define OSMINOR "OS=MACHTEN"
|
||||||
#define OS_MACHTEN
|
#define OS_MACHTEN
|
||||||
#endif
|
#endif
|
||||||
#ifdef mpeix
|
#ifdef mpeix
|
||||||
#define unix
|
#define unix
|
||||||
#define OSMINOR "OS=MPEIX"
|
#define OSMINOR "OS=MPEIX"
|
||||||
#define OS_MPEIX
|
#define OS_MPEIX
|
||||||
#define NO_VFORK
|
#define NO_VFORK
|
||||||
#endif
|
#endif
|
||||||
#ifdef __MVS__
|
#ifdef __MVS__
|
||||||
#define unix
|
#define unix
|
||||||
#define OSMINOR "OS=MVS"
|
#define OSMINOR "OS=MVS"
|
||||||
#define OS_MVS
|
#define OS_MVS
|
||||||
#endif
|
#endif
|
||||||
#ifdef _ATT4
|
#ifdef _ATT4
|
||||||
#define OSMINOR "OS=NCR"
|
#define OSMINOR "OS=NCR"
|
||||||
#define OS_NCR
|
#define OS_NCR
|
||||||
#endif
|
#endif
|
||||||
#ifdef __NetBSD__
|
#ifdef __NetBSD__
|
||||||
#define unix
|
#define unix
|
||||||
#define OSMINOR "OS=NETBSD"
|
#define OSMINOR "OS=NETBSD"
|
||||||
#define OS_NETBSD
|
#define OS_NETBSD
|
||||||
#define NO_VFORK
|
#define NO_VFORK
|
||||||
#endif
|
#endif
|
||||||
#ifdef __QNX__
|
#ifdef __QNX__
|
||||||
#define unix
|
#define unix
|
||||||
#ifdef __QNXNTO__
|
#ifdef __QNXNTO__
|
||||||
#define OSMINOR "OS=QNXNTO"
|
#define OSMINOR "OS=QNXNTO"
|
||||||
#define OS_QNXNTO
|
#define OS_QNXNTO
|
||||||
#else
|
#else
|
||||||
#define OSMINOR "OS=QNX"
|
#define OSMINOR "OS=QNX"
|
||||||
#define OS_QNX
|
#define OS_QNX
|
||||||
#define NO_VFORK
|
#define NO_VFORK
|
||||||
#define MAXLINE 996
|
#define MAXLINE 996
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
#ifdef NeXT
|
#ifdef NeXT
|
||||||
#ifdef __APPLE__
|
#ifdef __APPLE__
|
||||||
#define OSMINOR "OS=RHAPSODY"
|
#define OSMINOR "OS=RHAPSODY"
|
||||||
#define OS_RHAPSODY
|
#define OS_RHAPSODY
|
||||||
#else
|
#else
|
||||||
#define OSMINOR "OS=NEXT"
|
#define OSMINOR "OS=NEXT"
|
||||||
#define OS_NEXT
|
#define OS_NEXT
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
#ifdef __APPLE__
|
#ifdef __APPLE__
|
||||||
#define unix
|
#define unix
|
||||||
#define OSMINOR "OS=MACOSX"
|
#define OSMINOR "OS=MACOSX"
|
||||||
#define OS_MACOSX
|
#define OS_MACOSX
|
||||||
#endif
|
#endif
|
||||||
#ifdef __osf__
|
#ifdef __osf__
|
||||||
#ifndef unix
|
#ifndef unix
|
||||||
#define unix
|
#define unix
|
||||||
#endif
|
#endif
|
||||||
#define OSMINOR "OS=OSF"
|
#define OSMINOR "OS=OSF"
|
||||||
#define OS_OSF
|
#define OS_OSF
|
||||||
#endif
|
#endif
|
||||||
#ifdef _SEQUENT_
|
#ifdef _SEQUENT_
|
||||||
#define OSMINOR "OS=PTX"
|
#define OSMINOR "OS=PTX"
|
||||||
#define OS_PTX
|
#define OS_PTX
|
||||||
#endif
|
#endif
|
||||||
#ifdef M_XENIX
|
#ifdef M_XENIX
|
||||||
#define OSMINOR "OS=SCO"
|
#define OSMINOR "OS=SCO"
|
||||||
#define OS_SCO
|
#define OS_SCO
|
||||||
#define NO_VFORK
|
#define NO_VFORK
|
||||||
#endif
|
#endif
|
||||||
#ifdef sinix
|
#ifdef sinix
|
||||||
#define unix
|
#define unix
|
||||||
#define OSMINOR "OS=SINIX"
|
#define OSMINOR "OS=SINIX"
|
||||||
#define OS_SINIX
|
#define OS_SINIX
|
||||||
#endif
|
#endif
|
||||||
#ifdef sun
|
#ifdef sun
|
||||||
#if defined(__svr4__) || defined(__SVR4)
|
#if defined(__svr4__) || defined(__SVR4)
|
||||||
#define OSMINOR "OS=SOLARIS"
|
#define OSMINOR "OS=SOLARIS"
|
||||||
#define OS_SOLARIS
|
#define OS_SOLARIS
|
||||||
#else
|
#else
|
||||||
#define OSMINOR "OS=SUNOS"
|
#define OSMINOR "OS=SUNOS"
|
||||||
#define OS_SUNOS
|
#define OS_SUNOS
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
#ifdef ultrix
|
#ifdef ultrix
|
||||||
#define OSMINOR "OS=ULTRIX"
|
#define OSMINOR "OS=ULTRIX"
|
||||||
#define OS_ULTRIX
|
#define OS_ULTRIX
|
||||||
#endif
|
#endif
|
||||||
#ifdef _UNICOS
|
#ifdef _UNICOS
|
||||||
#define OSMINOR "OS=UNICOS"
|
#define OSMINOR "OS=UNICOS"
|
||||||
#define OS_UNICOS
|
#define OS_UNICOS
|
||||||
#endif
|
#endif
|
||||||
#if defined(__USLC__) && !defined(M_XENIX)
|
#if defined(__USLC__) && !defined(M_XENIX)
|
||||||
#define OSMINOR "OS=UNIXWARE"
|
#define OSMINOR "OS=UNIXWARE"
|
||||||
#define OS_UNIXWARE
|
#define OS_UNIXWARE
|
||||||
#endif
|
#endif
|
||||||
#ifdef __OpenBSD__
|
#ifdef __OpenBSD__
|
||||||
#define OSMINOR "OS=OPENBSD"
|
#define OSMINOR "OS=OPENBSD"
|
||||||
#define OS_OPENBSD
|
#define OS_OPENBSD
|
||||||
#define unix
|
#define unix
|
||||||
#endif
|
#endif
|
||||||
#if defined (__FreeBSD_kernel__) && !defined(__FreeBSD__)
|
#if defined (__FreeBSD_kernel__) && !defined(__FreeBSD__)
|
||||||
#define OSMINOR "OS=KFREEBSD"
|
#define OSMINOR "OS=KFREEBSD"
|
||||||
#define OS_KFREEBSD
|
#define OS_KFREEBSD
|
||||||
#endif
|
#endif
|
||||||
#ifndef OSMINOR
|
#ifndef OSMINOR
|
||||||
#define OSMINOR "OS=UNKNOWN"
|
#define OSMINOR "OS=UNKNOWN"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* All the UNIX includes */
|
/* All the UNIX includes */
|
||||||
@ -401,7 +401,7 @@
|
|||||||
#include <sys/stat.h>
|
#include <sys/stat.h>
|
||||||
|
|
||||||
#ifndef OS_MPEIX
|
#ifndef OS_MPEIX
|
||||||
#include <sys/file.h>
|
#include <sys/file.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include <fcntl.h>
|
#include <fcntl.h>
|
||||||
@ -413,11 +413,11 @@
|
|||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
|
|
||||||
#ifndef OS_QNX
|
#ifndef OS_QNX
|
||||||
#include <memory.h>
|
#include <memory.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef OS_ULTRIX
|
#ifndef OS_ULTRIX
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if !defined( OS_BSDI ) && \
|
#if !defined( OS_BSDI ) && \
|
||||||
@ -429,7 +429,7 @@
|
|||||||
!defined( OS_RHAPSODY ) && \
|
!defined( OS_RHAPSODY ) && \
|
||||||
!defined( OS_MVS ) && \
|
!defined( OS_MVS ) && \
|
||||||
!defined( OS_OPENBSD )
|
!defined( OS_OPENBSD )
|
||||||
#include <malloc.h>
|
#include <malloc.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
@ -443,57 +443,57 @@
|
|||||||
defined( ppc ) || \
|
defined( ppc ) || \
|
||||||
defined( __powerpc__ ) || \
|
defined( __powerpc__ ) || \
|
||||||
defined( __ppc__ )
|
defined( __ppc__ )
|
||||||
#define OSPLAT "OSPLAT=PPC"
|
#define OSPLAT "OSPLAT=PPC"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined( _ALPHA_ ) || \
|
#if defined( _ALPHA_ ) || \
|
||||||
defined( __alpha__ )
|
defined( __alpha__ )
|
||||||
#define OSPLAT "OSPLAT=AXP"
|
#define OSPLAT "OSPLAT=AXP"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined( _i386_ ) || \
|
#if defined( _i386_ ) || \
|
||||||
defined( __i386__ ) || \
|
defined( __i386__ ) || \
|
||||||
defined( __i386 ) || \
|
defined( __i386 ) || \
|
||||||
defined( _M_IX86 )
|
defined( _M_IX86 )
|
||||||
#define OSPLAT "OSPLAT=X86"
|
#define OSPLAT "OSPLAT=X86"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined( __ia64__ ) || \
|
#if defined( __ia64__ ) || \
|
||||||
defined( __IA64__ ) || \
|
defined( __IA64__ ) || \
|
||||||
defined( __ia64 )
|
defined( __ia64 )
|
||||||
#define OSPLAT "OSPLAT=IA64"
|
#define OSPLAT "OSPLAT=IA64"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined( __x86_64__ ) || \
|
#if defined( __x86_64__ ) || \
|
||||||
defined( __amd64__ ) || \
|
defined( __amd64__ ) || \
|
||||||
defined( _M_AMD64 )
|
defined( _M_AMD64 )
|
||||||
#define OSPLAT "OSPLAT=X86_64"
|
#define OSPLAT "OSPLAT=X86_64"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#if defined( __sparc__ ) || \
|
#if defined( __sparc__ ) || \
|
||||||
defined( __sparc )
|
defined( __sparc )
|
||||||
#define OSPLAT "OSPLAT=SPARC"
|
#define OSPLAT "OSPLAT=SPARC"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef __mips__
|
#ifdef __mips__
|
||||||
#define OSPLAT "OSPLAT=MIPS"
|
#define OSPLAT "OSPLAT=MIPS"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef __arm__
|
#ifdef __arm__
|
||||||
#define OSPLAT "OSPLAT=ARM"
|
#define OSPLAT "OSPLAT=ARM"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef __s390__
|
#ifdef __s390__
|
||||||
#define OSPLAT "OSPLAT=390"
|
#define OSPLAT "OSPLAT=390"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef __hppa
|
#ifdef __hppa
|
||||||
#define OSPLAT "OSPLAT=PARISC"
|
#define OSPLAT "OSPLAT=PARISC"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef OSPLAT
|
#ifndef OSPLAT
|
||||||
#define OSPLAT ""
|
#define OSPLAT ""
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -501,16 +501,16 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
#ifndef MAXLINE
|
#ifndef MAXLINE
|
||||||
#define MAXLINE 102400 /* longest 'together' actions' */
|
#define MAXLINE 102400 /* longest 'together' actions' */
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef EXITOK
|
#ifndef EXITOK
|
||||||
#define EXITOK 0
|
#define EXITOK 0
|
||||||
#define EXITBAD 1
|
#define EXITBAD 1
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef SPLITPATH
|
#ifndef SPLITPATH
|
||||||
#define SPLITPATH ':'
|
#define SPLITPATH ':'
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* You probably do not need to muck with these. */
|
/* You probably do not need to muck with these. */
|
||||||
@ -526,19 +526,18 @@
|
|||||||
#define DEBUG_MAX 14
|
#define DEBUG_MAX 14
|
||||||
|
|
||||||
|
|
||||||
struct globs
|
struct globs {
|
||||||
{
|
int noexec;
|
||||||
int noexec;
|
int jobs;
|
||||||
int jobs;
|
int quitquick;
|
||||||
int quitquick;
|
int newestfirst; /* build newest sources first */
|
||||||
int newestfirst; /* build newest sources first */
|
int pipe_action;
|
||||||
int pipe_action;
|
char debug[ DEBUG_MAX ];
|
||||||
char debug[ DEBUG_MAX ];
|
FILE * cmdout; /* print cmds, not run them */
|
||||||
FILE * cmdout; /* print cmds, not run them */
|
long timeout; /* number of seconds to limit actions to,
|
||||||
long timeout; /* number of seconds to limit actions to,
|
|
||||||
* default 0 for no limit.
|
* default 0 for no limit.
|
||||||
*/
|
*/
|
||||||
int dart; /* output build and test results formatted for Dart */
|
int dart; /* output build and test results formatted for Dart */
|
||||||
};
|
};
|
||||||
|
|
||||||
extern struct globs globs;
|
extern struct globs globs;
|
||||||
|
@ -26,56 +26,56 @@
|
|||||||
/* Tokens. */
|
/* Tokens. */
|
||||||
#ifndef YYTOKENTYPE
|
#ifndef YYTOKENTYPE
|
||||||
# define YYTOKENTYPE
|
# define YYTOKENTYPE
|
||||||
/* Put the tokens into the symbol table, so that GDB and other debuggers
|
/* Put the tokens into the symbol table, so that GDB and other debuggers
|
||||||
know about them. */
|
know about them. */
|
||||||
enum yytokentype {
|
enum yytokentype {
|
||||||
_BANG_t = 258,
|
_BANG_t = 258,
|
||||||
_BANG_EQUALS_t = 259,
|
_BANG_EQUALS_t = 259,
|
||||||
_AMPER_t = 260,
|
_AMPER_t = 260,
|
||||||
_AMPERAMPER_t = 261,
|
_AMPERAMPER_t = 261,
|
||||||
_LPAREN_t = 262,
|
_LPAREN_t = 262,
|
||||||
_RPAREN_t = 263,
|
_RPAREN_t = 263,
|
||||||
_PLUS_EQUALS_t = 264,
|
_PLUS_EQUALS_t = 264,
|
||||||
_COLON_t = 265,
|
_COLON_t = 265,
|
||||||
_SEMIC_t = 266,
|
_SEMIC_t = 266,
|
||||||
_LANGLE_t = 267,
|
_LANGLE_t = 267,
|
||||||
_LANGLE_EQUALS_t = 268,
|
_LANGLE_EQUALS_t = 268,
|
||||||
_EQUALS_t = 269,
|
_EQUALS_t = 269,
|
||||||
_RANGLE_t = 270,
|
_RANGLE_t = 270,
|
||||||
_RANGLE_EQUALS_t = 271,
|
_RANGLE_EQUALS_t = 271,
|
||||||
_QUESTION_EQUALS_t = 272,
|
_QUESTION_EQUALS_t = 272,
|
||||||
_LBRACKET_t = 273,
|
_LBRACKET_t = 273,
|
||||||
_RBRACKET_t = 274,
|
_RBRACKET_t = 274,
|
||||||
ACTIONS_t = 275,
|
ACTIONS_t = 275,
|
||||||
BIND_t = 276,
|
BIND_t = 276,
|
||||||
CASE_t = 277,
|
CASE_t = 277,
|
||||||
CLASS_t = 278,
|
CLASS_t = 278,
|
||||||
DEFAULT_t = 279,
|
DEFAULT_t = 279,
|
||||||
ELSE_t = 280,
|
ELSE_t = 280,
|
||||||
EXISTING_t = 281,
|
EXISTING_t = 281,
|
||||||
FOR_t = 282,
|
FOR_t = 282,
|
||||||
IF_t = 283,
|
IF_t = 283,
|
||||||
IGNORE_t = 284,
|
IGNORE_t = 284,
|
||||||
IN_t = 285,
|
IN_t = 285,
|
||||||
INCLUDE_t = 286,
|
INCLUDE_t = 286,
|
||||||
LOCAL_t = 287,
|
LOCAL_t = 287,
|
||||||
MODULE_t = 288,
|
MODULE_t = 288,
|
||||||
ON_t = 289,
|
ON_t = 289,
|
||||||
PIECEMEAL_t = 290,
|
PIECEMEAL_t = 290,
|
||||||
QUIETLY_t = 291,
|
QUIETLY_t = 291,
|
||||||
RETURN_t = 292,
|
RETURN_t = 292,
|
||||||
RULE_t = 293,
|
RULE_t = 293,
|
||||||
SWITCH_t = 294,
|
SWITCH_t = 294,
|
||||||
TOGETHER_t = 295,
|
TOGETHER_t = 295,
|
||||||
UPDATED_t = 296,
|
UPDATED_t = 296,
|
||||||
WHILE_t = 297,
|
WHILE_t = 297,
|
||||||
_LBRACE_t = 298,
|
_LBRACE_t = 298,
|
||||||
_BAR_t = 299,
|
_BAR_t = 299,
|
||||||
_BARBAR_t = 300,
|
_BARBAR_t = 300,
|
||||||
_RBRACE_t = 301,
|
_RBRACE_t = 301,
|
||||||
ARG = 302,
|
ARG = 302,
|
||||||
STRING = 303
|
STRING = 303
|
||||||
};
|
};
|
||||||
#endif
|
#endif
|
||||||
#define _BANG_t 258
|
#define _BANG_t 258
|
||||||
#define _BANG_EQUALS_t 259
|
#define _BANG_EQUALS_t 259
|
||||||
|
@ -1,44 +1,44 @@
|
|||||||
{ "!", _BANG_t },
|
{ "!", _BANG_t },
|
||||||
{ "!=", _BANG_EQUALS_t },
|
{ "!=", _BANG_EQUALS_t },
|
||||||
{ "&", _AMPER_t },
|
{ "&", _AMPER_t },
|
||||||
{ "&&", _AMPERAMPER_t },
|
{ "&&", _AMPERAMPER_t },
|
||||||
{ "(", _LPAREN_t },
|
{ "(", _LPAREN_t },
|
||||||
{ ")", _RPAREN_t },
|
{ ")", _RPAREN_t },
|
||||||
{ "+=", _PLUS_EQUALS_t },
|
{ "+=", _PLUS_EQUALS_t },
|
||||||
{ ":", _COLON_t },
|
{ ":", _COLON_t },
|
||||||
{ ";", _SEMIC_t },
|
{ ";", _SEMIC_t },
|
||||||
{ "<", _LANGLE_t },
|
{ "<", _LANGLE_t },
|
||||||
{ "<=", _LANGLE_EQUALS_t },
|
{ "<=", _LANGLE_EQUALS_t },
|
||||||
{ "=", _EQUALS_t },
|
{ "=", _EQUALS_t },
|
||||||
{ ">", _RANGLE_t },
|
{ ">", _RANGLE_t },
|
||||||
{ ">=", _RANGLE_EQUALS_t },
|
{ ">=", _RANGLE_EQUALS_t },
|
||||||
{ "?=", _QUESTION_EQUALS_t },
|
{ "?=", _QUESTION_EQUALS_t },
|
||||||
{ "[", _LBRACKET_t },
|
{ "[", _LBRACKET_t },
|
||||||
{ "]", _RBRACKET_t },
|
{ "]", _RBRACKET_t },
|
||||||
{ "actions", ACTIONS_t },
|
{ "actions", ACTIONS_t },
|
||||||
{ "bind", BIND_t },
|
{ "bind", BIND_t },
|
||||||
{ "case", CASE_t },
|
{ "case", CASE_t },
|
||||||
{ "class", CLASS_t },
|
{ "class", CLASS_t },
|
||||||
{ "default", DEFAULT_t },
|
{ "default", DEFAULT_t },
|
||||||
{ "else", ELSE_t },
|
{ "else", ELSE_t },
|
||||||
{ "existing", EXISTING_t },
|
{ "existing", EXISTING_t },
|
||||||
{ "for", FOR_t },
|
{ "for", FOR_t },
|
||||||
{ "if", IF_t },
|
{ "if", IF_t },
|
||||||
{ "ignore", IGNORE_t },
|
{ "ignore", IGNORE_t },
|
||||||
{ "in", IN_t },
|
{ "in", IN_t },
|
||||||
{ "include", INCLUDE_t },
|
{ "include", INCLUDE_t },
|
||||||
{ "local", LOCAL_t },
|
{ "local", LOCAL_t },
|
||||||
{ "module", MODULE_t },
|
{ "module", MODULE_t },
|
||||||
{ "on", ON_t },
|
{ "on", ON_t },
|
||||||
{ "piecemeal", PIECEMEAL_t },
|
{ "piecemeal", PIECEMEAL_t },
|
||||||
{ "quietly", QUIETLY_t },
|
{ "quietly", QUIETLY_t },
|
||||||
{ "return", RETURN_t },
|
{ "return", RETURN_t },
|
||||||
{ "rule", RULE_t },
|
{ "rule", RULE_t },
|
||||||
{ "switch", SWITCH_t },
|
{ "switch", SWITCH_t },
|
||||||
{ "together", TOGETHER_t },
|
{ "together", TOGETHER_t },
|
||||||
{ "updated", UPDATED_t },
|
{ "updated", UPDATED_t },
|
||||||
{ "while", WHILE_t },
|
{ "while", WHILE_t },
|
||||||
{ "{", _LBRACE_t },
|
{ "{", _LBRACE_t },
|
||||||
{ "|", _BAR_t },
|
{ "|", _BAR_t },
|
||||||
{ "||", _BARBAR_t },
|
{ "||", _BARBAR_t },
|
||||||
{ "}", _RBRACE_t },
|
{ "}", _RBRACE_t },
|
||||||
|
@ -56,9 +56,9 @@
|
|||||||
typedef struct _list LIST;
|
typedef struct _list LIST;
|
||||||
|
|
||||||
struct _list {
|
struct _list {
|
||||||
LIST *next;
|
LIST *next;
|
||||||
LIST *tail; /* only valid in head node */
|
LIST *tail; /* only valid in head node */
|
||||||
char *string; /* private copy */
|
char *string; /* private copy */
|
||||||
};
|
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -70,8 +70,8 @@ typedef struct _lol LOL;
|
|||||||
# define LOL_MAX 19
|
# define LOL_MAX 19
|
||||||
|
|
||||||
struct _lol {
|
struct _lol {
|
||||||
int count;
|
int count;
|
||||||
LIST *list[ LOL_MAX ];
|
LIST *list[ LOL_MAX ];
|
||||||
};
|
};
|
||||||
|
|
||||||
LIST * list_append( LIST *l, LIST *nl );
|
LIST * list_append( LIST *l, LIST *nl );
|
||||||
|
@ -14,12 +14,12 @@ int make( int n_targets, const char **targets, int anyhow );
|
|||||||
int make1( TARGET *t );
|
int make1( TARGET *t );
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
int temp;
|
int temp;
|
||||||
int updating;
|
int updating;
|
||||||
int cantfind;
|
int cantfind;
|
||||||
int cantmake;
|
int cantmake;
|
||||||
int targets;
|
int targets;
|
||||||
int made;
|
int made;
|
||||||
} COUNTS ;
|
} COUNTS ;
|
||||||
|
|
||||||
|
|
||||||
|
@ -65,24 +65,24 @@ typedef unsigned int md5_word_t; /* 32-bit word */
|
|||||||
|
|
||||||
/* Define the state of the MD5 Algorithm. */
|
/* Define the state of the MD5 Algorithm. */
|
||||||
typedef struct md5_state_s {
|
typedef struct md5_state_s {
|
||||||
md5_word_t count[2]; /* message length in bits, lsw first */
|
md5_word_t count[2]; /* message length in bits, lsw first */
|
||||||
md5_word_t abcd[4]; /* digest buffer */
|
md5_word_t abcd[4]; /* digest buffer */
|
||||||
md5_byte_t buf[64]; /* accumulate block */
|
md5_byte_t buf[64]; /* accumulate block */
|
||||||
} md5_state_t;
|
} md5_state_t;
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C"
|
extern "C"
|
||||||
{
|
{
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* Initialize the algorithm. */
|
/* Initialize the algorithm. */
|
||||||
void md5_init(md5_state_t *pms);
|
void md5_init(md5_state_t *pms);
|
||||||
|
|
||||||
/* Append a string to the message. */
|
/* Append a string to the message. */
|
||||||
void md5_append(md5_state_t *pms, const md5_byte_t *data, int nbytes);
|
void md5_append(md5_state_t *pms, const md5_byte_t *data, int nbytes);
|
||||||
|
|
||||||
/* Finish the message and return the digest. */
|
/* Finish the message and return the digest. */
|
||||||
void md5_finish(md5_state_t *pms, md5_byte_t digest[16]);
|
void md5_finish(md5_state_t *pms, md5_byte_t digest[16]);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
} /* end extern "C" */
|
} /* end extern "C" */
|
||||||
|
@ -11,122 +11,122 @@ http://www.boost.org/LICENSE_1_0.txt)
|
|||||||
|
|
||||||
#ifdef OPT_BOEHM_GC
|
#ifdef OPT_BOEHM_GC
|
||||||
|
|
||||||
/* Use Boehm GC memory allocator. */
|
/* Use Boehm GC memory allocator. */
|
||||||
#include <gc.h>
|
#include <gc.h>
|
||||||
#define bjam_malloc_x(s) memset(GC_malloc(s),0,s)
|
#define bjam_malloc_x(s) memset(GC_malloc(s),0,s)
|
||||||
#define bjam_malloc_atomic_x(s) memset(GC_malloc_atomic(s),0,s)
|
#define bjam_malloc_atomic_x(s) memset(GC_malloc_atomic(s),0,s)
|
||||||
#define bjam_calloc_x(n,s) memset(GC_malloc((n)*(s)),0,(n)*(s))
|
#define bjam_calloc_x(n,s) memset(GC_malloc((n)*(s)),0,(n)*(s))
|
||||||
#define bjam_calloc_atomic_x(n,s) memset(GC_malloc_atomic((n)*(s)),0,(n)*(s))
|
#define bjam_calloc_atomic_x(n,s) memset(GC_malloc_atomic((n)*(s)),0,(n)*(s))
|
||||||
#define bjam_realloc_x(p,s) GC_realloc(p,s)
|
#define bjam_realloc_x(p,s) GC_realloc(p,s)
|
||||||
#define bjam_free_x(p) GC_free(p)
|
#define bjam_free_x(p) GC_free(p)
|
||||||
#define bjam_mem_init_x() GC_init(); GC_enable_incremental()
|
#define bjam_mem_init_x() GC_init(); GC_enable_incremental()
|
||||||
|
|
||||||
#define bjam_malloc_raw_x(s) malloc(s)
|
#define bjam_malloc_raw_x(s) malloc(s)
|
||||||
#define bjam_calloc_raw_x(n,s) calloc(n,s)
|
#define bjam_calloc_raw_x(n,s) calloc(n,s)
|
||||||
#define bjam_realloc_raw_x(p,s) realloc(p,s)
|
#define bjam_realloc_raw_x(p,s) realloc(p,s)
|
||||||
#define bjam_free_raw_x(p) free(p)
|
#define bjam_free_raw_x(p) free(p)
|
||||||
|
|
||||||
#ifndef BJAM_NEWSTR_NO_ALLOCATE
|
#ifndef BJAM_NEWSTR_NO_ALLOCATE
|
||||||
#define BJAM_NEWSTR_NO_ALLOCATE
|
#define BJAM_NEWSTR_NO_ALLOCATE
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#elif defined(OPT_DUMA)
|
#elif defined(OPT_DUMA)
|
||||||
|
|
||||||
/* Use Duma memory debugging library. */
|
/* Use Duma memory debugging library. */
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#define _DUMA_CONFIG_H_
|
#define _DUMA_CONFIG_H_
|
||||||
#define DUMA_NO_GLOBAL_MALLOC_FREE
|
#define DUMA_NO_GLOBAL_MALLOC_FREE
|
||||||
#define DUMA_EXPLICIT_INIT
|
#define DUMA_EXPLICIT_INIT
|
||||||
#define DUMA_NO_THREAD_SAFETY
|
#define DUMA_NO_THREAD_SAFETY
|
||||||
#define DUMA_NO_CPP_SUPPORT
|
#define DUMA_NO_CPP_SUPPORT
|
||||||
/* #define DUMA_NO_LEAKDETECTION */
|
/* #define DUMA_NO_LEAKDETECTION */
|
||||||
/* #define DUMA_USE_FRAMENO */
|
/* #define DUMA_USE_FRAMENO */
|
||||||
/* #define DUMA_PREFER_ATEXIT */
|
/* #define DUMA_PREFER_ATEXIT */
|
||||||
/* #define DUMA_OLD_DEL_MACRO */
|
/* #define DUMA_OLD_DEL_MACRO */
|
||||||
/* #define DUMA_NO_HANG_MSG */
|
/* #define DUMA_NO_HANG_MSG */
|
||||||
#define DUMA_PAGE_SIZE 4096
|
#define DUMA_PAGE_SIZE 4096
|
||||||
#define DUMA_MIN_ALIGNMENT 1
|
#define DUMA_MIN_ALIGNMENT 1
|
||||||
/* #define DUMA_GNU_INIT_ATTR 0 */
|
/* #define DUMA_GNU_INIT_ATTR 0 */
|
||||||
typedef unsigned int DUMA_ADDR;
|
typedef unsigned int DUMA_ADDR;
|
||||||
typedef unsigned int DUMA_SIZE;
|
typedef unsigned int DUMA_SIZE;
|
||||||
#include <duma.h>
|
#include <duma.h>
|
||||||
#define bjam_malloc_x(s) malloc(s)
|
#define bjam_malloc_x(s) malloc(s)
|
||||||
#define bjam_calloc_x(n,s) calloc(n,s)
|
#define bjam_calloc_x(n,s) calloc(n,s)
|
||||||
#define bjam_realloc_x(p,s) realloc(p,s)
|
#define bjam_realloc_x(p,s) realloc(p,s)
|
||||||
#define bjam_free_x(p) free(p)
|
#define bjam_free_x(p) free(p)
|
||||||
|
|
||||||
#ifndef BJAM_NEWSTR_NO_ALLOCATE
|
#ifndef BJAM_NEWSTR_NO_ALLOCATE
|
||||||
#define BJAM_NEWSTR_NO_ALLOCATE
|
#define BJAM_NEWSTR_NO_ALLOCATE
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
/* Standard C memory allocation. */
|
/* Standard C memory allocation. */
|
||||||
#define bjam_malloc_x(s) malloc(s)
|
#define bjam_malloc_x(s) malloc(s)
|
||||||
#define bjam_calloc_x(n,s) calloc(n,s)
|
#define bjam_calloc_x(n,s) calloc(n,s)
|
||||||
#define bjam_realloc_x(p,s) realloc(p,s)
|
#define bjam_realloc_x(p,s) realloc(p,s)
|
||||||
#define bjam_free_x(p) free(p)
|
#define bjam_free_x(p) free(p)
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef bjam_malloc_atomic_x
|
#ifndef bjam_malloc_atomic_x
|
||||||
#define bjam_malloc_atomic_x(s) bjam_malloc_x(s)
|
#define bjam_malloc_atomic_x(s) bjam_malloc_x(s)
|
||||||
#endif
|
#endif
|
||||||
#ifndef bjam_calloc_atomic_x
|
#ifndef bjam_calloc_atomic_x
|
||||||
#define bjam_calloc_atomic_x(n,s) bjam_calloc_x(n,s)
|
#define bjam_calloc_atomic_x(n,s) bjam_calloc_x(n,s)
|
||||||
#endif
|
#endif
|
||||||
#ifndef bjam_mem_init_x
|
#ifndef bjam_mem_init_x
|
||||||
#define bjam_mem_init_x()
|
#define bjam_mem_init_x()
|
||||||
#endif
|
#endif
|
||||||
#ifndef bjam_mem_close_x
|
#ifndef bjam_mem_close_x
|
||||||
#define bjam_mem_close_x()
|
#define bjam_mem_close_x()
|
||||||
#endif
|
#endif
|
||||||
#ifndef bjam_malloc_raw_x
|
#ifndef bjam_malloc_raw_x
|
||||||
#define bjam_malloc_raw_x(s) bjam_malloc_x(s)
|
#define bjam_malloc_raw_x(s) bjam_malloc_x(s)
|
||||||
#endif
|
#endif
|
||||||
#ifndef bjam_calloc_raw_x
|
#ifndef bjam_calloc_raw_x
|
||||||
#define bjam_calloc_raw_x(n,s) bjam_calloc_x(n,s)
|
#define bjam_calloc_raw_x(n,s) bjam_calloc_x(n,s)
|
||||||
#endif
|
#endif
|
||||||
#ifndef bjam_realloc_raw_x
|
#ifndef bjam_realloc_raw_x
|
||||||
#define bjam_realloc_raw_x(p,s) bjam_realloc_x(p,s)
|
#define bjam_realloc_raw_x(p,s) bjam_realloc_x(p,s)
|
||||||
#endif
|
#endif
|
||||||
#ifndef bjam_free_raw_x
|
#ifndef bjam_free_raw_x
|
||||||
#define bjam_free_raw_x(p) bjam_free_x(p)
|
#define bjam_free_raw_x(p) bjam_free_x(p)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef OPT_DEBUG_PROFILE
|
#ifdef OPT_DEBUG_PROFILE
|
||||||
|
|
||||||
/* Profile tracing of memory allocations. */
|
/* Profile tracing of memory allocations. */
|
||||||
#define BJAM_MALLOC(s) (profile_memory(s), bjam_malloc_x(s))
|
#define BJAM_MALLOC(s) (profile_memory(s), bjam_malloc_x(s))
|
||||||
#define BJAM_MALLOC_ATOMIC(s) (profile_memory(s), bjam_malloc_atomic_x(s))
|
#define BJAM_MALLOC_ATOMIC(s) (profile_memory(s), bjam_malloc_atomic_x(s))
|
||||||
#define BJAM_CALLOC(n,s) (profile_memory(n*s), bjam_calloc_x(n,s))
|
#define BJAM_CALLOC(n,s) (profile_memory(n*s), bjam_calloc_x(n,s))
|
||||||
#define BJAM_CALLOC_ATOMIC(n,s) (profile_memory(n*s), bjam_calloc_atomic_x(n,s))
|
#define BJAM_CALLOC_ATOMIC(n,s) (profile_memory(n*s), bjam_calloc_atomic_x(n,s))
|
||||||
#define BJAM_REALLOC(p,s) (profile_memory(s), bjam_realloc_x(p,s))
|
#define BJAM_REALLOC(p,s) (profile_memory(s), bjam_realloc_x(p,s))
|
||||||
#define BJAM_FREE(p) bjam_free_x(p)
|
#define BJAM_FREE(p) bjam_free_x(p)
|
||||||
#define BJAM_MEM_INIT() bjam_mem_init_x()
|
#define BJAM_MEM_INIT() bjam_mem_init_x()
|
||||||
#define BJAM_MEM_CLOSE() bjam_mem_close_x()
|
#define BJAM_MEM_CLOSE() bjam_mem_close_x()
|
||||||
|
|
||||||
#define BJAM_MALLOC_RAW(s) (profile_memory(s), bjam_malloc_raw_x(s))
|
#define BJAM_MALLOC_RAW(s) (profile_memory(s), bjam_malloc_raw_x(s))
|
||||||
#define BJAM_CALLOC_RAW(n,s) (profile_memory(n*s), bjam_calloc_raw_x(n,s))
|
#define BJAM_CALLOC_RAW(n,s) (profile_memory(n*s), bjam_calloc_raw_x(n,s))
|
||||||
#define BJAM_REALLOC_RAW(p,s) (profile_memory(s), bjam_realloc_raw_x(p,s))
|
#define BJAM_REALLOC_RAW(p,s) (profile_memory(s), bjam_realloc_raw_x(p,s))
|
||||||
#define BJAM_FREE_RAW(p) bjam_free_raw_x(p)
|
#define BJAM_FREE_RAW(p) bjam_free_raw_x(p)
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
/* No mem tracing. */
|
/* No mem tracing. */
|
||||||
#define BJAM_MALLOC(s) bjam_malloc_x(s)
|
#define BJAM_MALLOC(s) bjam_malloc_x(s)
|
||||||
#define BJAM_MALLOC_ATOMIC(s) bjam_malloc_atomic_x(s)
|
#define BJAM_MALLOC_ATOMIC(s) bjam_malloc_atomic_x(s)
|
||||||
#define BJAM_CALLOC(n,s) bjam_calloc_x(n,s)
|
#define BJAM_CALLOC(n,s) bjam_calloc_x(n,s)
|
||||||
#define BJAM_CALLOC_ATOMIC(n,s) bjam_calloc_atomic_x(n,s)
|
#define BJAM_CALLOC_ATOMIC(n,s) bjam_calloc_atomic_x(n,s)
|
||||||
#define BJAM_REALLOC(p,s) bjam_realloc_x(p,s)
|
#define BJAM_REALLOC(p,s) bjam_realloc_x(p,s)
|
||||||
#define BJAM_FREE(p) bjam_free_x(p)
|
#define BJAM_FREE(p) bjam_free_x(p)
|
||||||
#define BJAM_MEM_INIT() bjam_mem_init_x()
|
#define BJAM_MEM_INIT() bjam_mem_init_x()
|
||||||
#define BJAM_MEM_CLOSE() bjam_mem_close_x()
|
#define BJAM_MEM_CLOSE() bjam_mem_close_x()
|
||||||
|
|
||||||
#define BJAM_MALLOC_RAW(s) bjam_malloc_raw_x(s)
|
#define BJAM_MALLOC_RAW(s) bjam_malloc_raw_x(s)
|
||||||
#define BJAM_CALLOC_RAW(n,s) bjam_calloc_raw_x(n,s)
|
#define BJAM_CALLOC_RAW(n,s) bjam_calloc_raw_x(n,s)
|
||||||
#define BJAM_REALLOC_RAW(p,s) bjam_realloc_raw_x(p,s)
|
#define BJAM_REALLOC_RAW(p,s) bjam_realloc_raw_x(p,s)
|
||||||
#define BJAM_FREE_RAW(p) bjam_free_raw_x(p)
|
#define BJAM_FREE_RAW(p) bjam_free_raw_x(p)
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -8,15 +8,14 @@
|
|||||||
|
|
||||||
#include "lists.h"
|
#include "lists.h"
|
||||||
|
|
||||||
struct module_t
|
struct module_t {
|
||||||
{
|
char* name;
|
||||||
char* name;
|
struct hash* rules;
|
||||||
struct hash* rules;
|
struct hash* variables;
|
||||||
struct hash* variables;
|
struct hash* imported_modules;
|
||||||
struct hash* imported_modules;
|
struct module_t* class_module;
|
||||||
struct module_t* class_module;
|
struct hash* native_rules;
|
||||||
struct hash* native_rules;
|
int user_module;
|
||||||
int user_module;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
typedef struct module_t module_t ; /* MSVC debugger gets confused unless this is provided */
|
typedef struct module_t module_t ; /* MSVC debugger gets confused unless this is provided */
|
||||||
|
@ -7,20 +7,19 @@
|
|||||||
|
|
||||||
#include "rules.h"
|
#include "rules.h"
|
||||||
|
|
||||||
struct native_rule_t
|
struct native_rule_t {
|
||||||
{
|
char* name;
|
||||||
char* name;
|
argument_list* arguments;
|
||||||
argument_list* arguments;
|
PARSE* procedure;
|
||||||
PARSE* procedure;
|
/* Version of the interface that the native rule provides.
|
||||||
/* Version of the interface that the native rule provides.
|
It's possible that we want to change the set parameter
|
||||||
It's possible that we want to change the set parameter
|
for existing native rule. In that case, version number
|
||||||
for existing native rule. In that case, version number
|
should be incremented so that Boost.Build can check for
|
||||||
should be incremented so that Boost.Build can check for
|
version it relies on.
|
||||||
version it relies on.
|
|
||||||
|
|
||||||
Versions are numbered from 1.
|
Versions are numbered from 1.
|
||||||
*/
|
*/
|
||||||
int version;
|
int version;
|
||||||
};
|
};
|
||||||
|
|
||||||
/* MSVC debugger gets confused unless this is provided */
|
/* MSVC debugger gets confused unless this is provided */
|
||||||
|
@ -11,10 +11,9 @@
|
|||||||
* \ -) "Command line option."
|
* \ -) "Command line option."
|
||||||
*/
|
*/
|
||||||
|
|
||||||
typedef struct bjam_option
|
typedef struct bjam_option {
|
||||||
{
|
char flag; /* filled in by getoption() */
|
||||||
char flag; /* filled in by getoption() */
|
char *val; /* set to random address if true */
|
||||||
char *val; /* set to random address if true */
|
|
||||||
} bjam_option;
|
} bjam_option;
|
||||||
|
|
||||||
# define N_OPTS 256
|
# define N_OPTS 256
|
||||||
|
@ -14,13 +14,13 @@
|
|||||||
#define EXIT_TIMEOUT 2
|
#define EXIT_TIMEOUT 2
|
||||||
|
|
||||||
void out_action(
|
void out_action(
|
||||||
const char * action,
|
const char * action,
|
||||||
const char * target,
|
const char * target,
|
||||||
const char * command,
|
const char * command,
|
||||||
const char * out_data,
|
const char * out_data,
|
||||||
const char * err_data,
|
const char * err_data,
|
||||||
int exit_reason
|
int exit_reason
|
||||||
);
|
);
|
||||||
|
|
||||||
char * outf_int( int value );
|
char * outf_int( int value );
|
||||||
char * outf_double( double value );
|
char * outf_double( double value );
|
||||||
|
@ -26,31 +26,31 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
struct _PARSE {
|
struct _PARSE {
|
||||||
LIST * (* func)( PARSE *, FRAME * );
|
LIST * (* func)( PARSE *, FRAME * );
|
||||||
PARSE * left;
|
PARSE * left;
|
||||||
PARSE * right;
|
PARSE * right;
|
||||||
PARSE * third;
|
PARSE * third;
|
||||||
char * string;
|
char * string;
|
||||||
char * string1;
|
char * string1;
|
||||||
int num;
|
int num;
|
||||||
int refs;
|
int refs;
|
||||||
/* module * module; */
|
/* module * module; */
|
||||||
char * rulename;
|
char * rulename;
|
||||||
char * file;
|
char * file;
|
||||||
int line;
|
int line;
|
||||||
};
|
};
|
||||||
|
|
||||||
void parse_file( char *, FRAME * );
|
void parse_file( char *, FRAME * );
|
||||||
void parse_save( PARSE * );
|
void parse_save( PARSE * );
|
||||||
|
|
||||||
PARSE * parse_make(
|
PARSE * parse_make(
|
||||||
LIST * (* func)( PARSE *, FRAME * ),
|
LIST * (* func)( PARSE *, FRAME * ),
|
||||||
PARSE * left,
|
PARSE * left,
|
||||||
PARSE * right,
|
PARSE * right,
|
||||||
PARSE * third,
|
PARSE * third,
|
||||||
char * string,
|
char * string,
|
||||||
char * string1,
|
char * string1,
|
||||||
int num );
|
int num );
|
||||||
|
|
||||||
void parse_refer ( PARSE * );
|
void parse_refer ( PARSE * );
|
||||||
void parse_free ( PARSE * );
|
void parse_free ( PARSE * );
|
||||||
|
@ -28,17 +28,15 @@
|
|||||||
typedef struct _pathname PATHNAME;
|
typedef struct _pathname PATHNAME;
|
||||||
typedef struct _pathpart PATHPART;
|
typedef struct _pathpart PATHPART;
|
||||||
|
|
||||||
struct _pathpart
|
struct _pathpart {
|
||||||
{
|
char * ptr;
|
||||||
char * ptr;
|
int len;
|
||||||
int len;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
struct _pathname
|
struct _pathname {
|
||||||
{
|
PATHPART part[6];
|
||||||
PATHPART part[6];
|
|
||||||
#ifdef OS_VMS
|
#ifdef OS_VMS
|
||||||
int parent;
|
int parent;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define f_grist part[0]
|
#define f_grist part[0]
|
||||||
|
@ -9,13 +9,13 @@
|
|||||||
|
|
||||||
#define NSUBEXP 10
|
#define NSUBEXP 10
|
||||||
typedef struct regexp {
|
typedef struct regexp {
|
||||||
char *startp[NSUBEXP];
|
char *startp[NSUBEXP];
|
||||||
char *endp[NSUBEXP];
|
char *endp[NSUBEXP];
|
||||||
char regstart; /* Internal use only. */
|
char regstart; /* Internal use only. */
|
||||||
char reganch; /* Internal use only. */
|
char reganch; /* Internal use only. */
|
||||||
char *regmust; /* Internal use only. */
|
char *regmust; /* Internal use only. */
|
||||||
int regmlen; /* Internal use only. */
|
int regmlen; /* Internal use only. */
|
||||||
char program[1]; /* Unwarranted chumminess with compiler. */
|
char program[1]; /* Unwarranted chumminess with compiler. */
|
||||||
} regexp;
|
} regexp;
|
||||||
|
|
||||||
regexp *regcomp( char *exp );
|
regexp *regcomp( char *exp );
|
||||||
|
@ -53,19 +53,17 @@ typedef struct _settings SETTINGS ;
|
|||||||
/* RULE - a generic jam rule, the product of RULE and ACTIONS. */
|
/* RULE - a generic jam rule, the product of RULE and ACTIONS. */
|
||||||
|
|
||||||
/* A rule's argument list. */
|
/* A rule's argument list. */
|
||||||
struct argument_list
|
struct argument_list {
|
||||||
{
|
int reference_count;
|
||||||
int reference_count;
|
LOL data[1];
|
||||||
LOL data[1];
|
|
||||||
};
|
};
|
||||||
|
|
||||||
/* Build actions corresponding to a rule. */
|
/* Build actions corresponding to a rule. */
|
||||||
struct rule_actions
|
struct rule_actions {
|
||||||
{
|
int reference_count;
|
||||||
int reference_count;
|
char * command; /* command string from ACTIONS */
|
||||||
char * command; /* command string from ACTIONS */
|
LIST * bindlist;
|
||||||
LIST * bindlist;
|
int flags; /* modifiers on ACTIONS */
|
||||||
int flags; /* modifiers on ACTIONS */
|
|
||||||
|
|
||||||
#define RULE_NEWSRCS 0x01 /* $(>) is updated sources only */
|
#define RULE_NEWSRCS 0x01 /* $(>) is updated sources only */
|
||||||
#define RULE_TOGETHER 0x02 /* combine actions on single target */
|
#define RULE_TOGETHER 0x02 /* combine actions on single target */
|
||||||
@ -78,67 +76,61 @@ struct rule_actions
|
|||||||
typedef struct rule_actions rule_actions;
|
typedef struct rule_actions rule_actions;
|
||||||
typedef struct argument_list argument_list;
|
typedef struct argument_list argument_list;
|
||||||
|
|
||||||
struct _rule
|
struct _rule {
|
||||||
{
|
char * name;
|
||||||
char * name;
|
PARSE * procedure; /* parse tree from RULE */
|
||||||
PARSE * procedure; /* parse tree from RULE */
|
argument_list * arguments; /* argument checking info, or NULL for unchecked
|
||||||
argument_list * arguments; /* argument checking info, or NULL for unchecked
|
|
||||||
*/
|
*/
|
||||||
rule_actions * actions; /* build actions, or NULL for no actions */
|
rule_actions * actions; /* build actions, or NULL for no actions */
|
||||||
module_t * module; /* module in which this rule is executed */
|
module_t * module; /* module in which this rule is executed */
|
||||||
int exported; /* nonzero if this rule is supposed to appear in
|
int exported; /* nonzero if this rule is supposed to appear in
|
||||||
* the global module and be automatically
|
* the global module and be automatically
|
||||||
* imported into other modules
|
* imported into other modules
|
||||||
*/
|
*/
|
||||||
#ifdef HAVE_PYTHON
|
#ifdef HAVE_PYTHON
|
||||||
PyObject * python_function;
|
PyObject * python_function;
|
||||||
#endif
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
/* ACTIONS - a chain of ACTIONs. */
|
/* ACTIONS - a chain of ACTIONs. */
|
||||||
struct _actions
|
struct _actions {
|
||||||
{
|
ACTIONS * next;
|
||||||
ACTIONS * next;
|
ACTIONS * tail; /* valid only for head */
|
||||||
ACTIONS * tail; /* valid only for head */
|
ACTION * action;
|
||||||
ACTION * action;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
/* ACTION - a RULE instance with targets and sources. */
|
/* ACTION - a RULE instance with targets and sources. */
|
||||||
struct _action
|
struct _action {
|
||||||
{
|
RULE * rule;
|
||||||
RULE * rule;
|
TARGETS * targets;
|
||||||
TARGETS * targets;
|
TARGETS * sources; /* aka $(>) */
|
||||||
TARGETS * sources; /* aka $(>) */
|
char running; /* has been started */
|
||||||
char running; /* has been started */
|
char status; /* see TARGET status */
|
||||||
char status; /* see TARGET status */
|
|
||||||
};
|
};
|
||||||
|
|
||||||
/* SETTINGS - variables to set when executing a TARGET's ACTIONS. */
|
/* SETTINGS - variables to set when executing a TARGET's ACTIONS. */
|
||||||
struct _settings
|
struct _settings {
|
||||||
{
|
SETTINGS * next;
|
||||||
SETTINGS * next;
|
char * symbol; /* symbol name for var_set() */
|
||||||
char * symbol; /* symbol name for var_set() */
|
LIST * value; /* symbol value for var_set() */
|
||||||
LIST * value; /* symbol value for var_set() */
|
int multiple;
|
||||||
int multiple;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
/* TARGETS - a chain of TARGETs. */
|
/* TARGETS - a chain of TARGETs. */
|
||||||
struct _targets
|
struct _targets {
|
||||||
{
|
TARGETS * next;
|
||||||
TARGETS * next;
|
TARGETS * tail; /* valid only for head */
|
||||||
TARGETS * tail; /* valid only for head */
|
TARGET * target;
|
||||||
TARGET * target;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
/* TARGET - an entity (e.g. a file) that can be built. */
|
/* TARGET - an entity (e.g. a file) that can be built. */
|
||||||
struct _target
|
struct _target {
|
||||||
{
|
char * name;
|
||||||
char * name;
|
char * boundname; /* if search() relocates target */
|
||||||
char * boundname; /* if search() relocates target */
|
ACTIONS * actions; /* rules to execute, if any */
|
||||||
ACTIONS * actions; /* rules to execute, if any */
|
SETTINGS * settings; /* variables to define */
|
||||||
SETTINGS * settings; /* variables to define */
|
|
||||||
|
|
||||||
short flags; /* status info */
|
short flags; /* status info */
|
||||||
|
|
||||||
#define T_FLAG_TEMP 0x0001 /* TEMPORARY applied */
|
#define T_FLAG_TEMP 0x0001 /* TEMPORARY applied */
|
||||||
#define T_FLAG_NOCARE 0x0002 /* NOCARE applied */
|
#define T_FLAG_NOCARE 0x0002 /* NOCARE applied */
|
||||||
@ -148,28 +140,28 @@ struct _target
|
|||||||
#define T_FLAG_NOUPDATE 0x0020 /* NOUPDATE applied */
|
#define T_FLAG_NOUPDATE 0x0020 /* NOUPDATE applied */
|
||||||
#define T_FLAG_VISITED 0x0040 /* CWM: Used in debugging */
|
#define T_FLAG_VISITED 0x0040 /* CWM: Used in debugging */
|
||||||
|
|
||||||
/* This flag has been added to support a new built-in rule named "RMBAD". It is
|
/* This flag has been added to support a new built-in rule named "RMBAD". It is
|
||||||
* used to force removal of outdated targets whose dependencies fail to build.
|
* used to force removal of outdated targets whose dependencies fail to build.
|
||||||
*/
|
*/
|
||||||
#define T_FLAG_RMOLD 0x0080 /* RMBAD applied */
|
#define T_FLAG_RMOLD 0x0080 /* RMBAD applied */
|
||||||
|
|
||||||
/* This flag was added to support a new built-in rule named "FAIL_EXPECTED" used
|
/* This flag was added to support a new built-in rule named "FAIL_EXPECTED" used
|
||||||
* to indicate that the result of running a given action should be inverted,
|
* to indicate that the result of running a given action should be inverted,
|
||||||
* i.e. ok <=> fail. This is useful for launching certain test runs from a
|
* i.e. ok <=> fail. This is useful for launching certain test runs from a
|
||||||
* Jamfile.
|
* Jamfile.
|
||||||
*/
|
*/
|
||||||
#define T_FLAG_FAIL_EXPECTED 0x0100 /* FAIL_EXPECTED applied */
|
#define T_FLAG_FAIL_EXPECTED 0x0100 /* FAIL_EXPECTED applied */
|
||||||
|
|
||||||
#define T_FLAG_INTERNAL 0x0200 /* internal INCLUDES node */
|
#define T_FLAG_INTERNAL 0x0200 /* internal INCLUDES node */
|
||||||
|
|
||||||
/* Indicates that the target must be a file. This prevents matching non-files,
|
/* Indicates that the target must be a file. This prevents matching non-files,
|
||||||
* like directories, when a target is searched.
|
* like directories, when a target is searched.
|
||||||
*/
|
*/
|
||||||
#define T_FLAG_ISFILE 0x0400
|
#define T_FLAG_ISFILE 0x0400
|
||||||
|
|
||||||
#define T_FLAG_PRECIOUS 0x0800
|
#define T_FLAG_PRECIOUS 0x0800
|
||||||
|
|
||||||
char binding; /* how target relates to a real file or
|
char binding; /* how target relates to a real file or
|
||||||
* folder
|
* folder
|
||||||
*/
|
*/
|
||||||
|
|
||||||
@ -178,32 +170,32 @@ struct _target
|
|||||||
#define T_BIND_PARENTS 2 /* using parent's timestamp */
|
#define T_BIND_PARENTS 2 /* using parent's timestamp */
|
||||||
#define T_BIND_EXISTS 3 /* real file, timestamp valid */
|
#define T_BIND_EXISTS 3 /* real file, timestamp valid */
|
||||||
|
|
||||||
TARGETS * depends; /* dependencies */
|
TARGETS * depends; /* dependencies */
|
||||||
TARGETS * dependants; /* the inverse of dependencies */
|
TARGETS * dependants; /* the inverse of dependencies */
|
||||||
TARGETS * rebuilds; /* targets that should be force-rebuilt
|
TARGETS * rebuilds; /* targets that should be force-rebuilt
|
||||||
* whenever this one is
|
* whenever this one is
|
||||||
*/
|
*/
|
||||||
TARGET * includes; /* internal includes node */
|
TARGET * includes; /* internal includes node */
|
||||||
TARGET * original_target; /* original_target->includes = this */
|
TARGET * original_target; /* original_target->includes = this */
|
||||||
char rescanned;
|
char rescanned;
|
||||||
|
|
||||||
time_t time; /* update time */
|
time_t time; /* update time */
|
||||||
time_t leaf; /* update time of leaf sources */
|
time_t leaf; /* update time of leaf sources */
|
||||||
|
|
||||||
char fate; /* make0()'s diagnosis */
|
char fate; /* make0()'s diagnosis */
|
||||||
|
|
||||||
#define T_FATE_INIT 0 /* nothing done to target */
|
#define T_FATE_INIT 0 /* nothing done to target */
|
||||||
#define T_FATE_MAKING 1 /* make0(target) on stack */
|
#define T_FATE_MAKING 1 /* make0(target) on stack */
|
||||||
|
|
||||||
#define T_FATE_STABLE 2 /* target did not need updating */
|
#define T_FATE_STABLE 2 /* target did not need updating */
|
||||||
#define T_FATE_NEWER 3 /* target newer than parent */
|
#define T_FATE_NEWER 3 /* target newer than parent */
|
||||||
|
|
||||||
#define T_FATE_SPOIL 4 /* >= SPOIL rebuilds parents */
|
#define T_FATE_SPOIL 4 /* >= SPOIL rebuilds parents */
|
||||||
#define T_FATE_ISTMP 4 /* unneeded temp target oddly present */
|
#define T_FATE_ISTMP 4 /* unneeded temp target oddly present */
|
||||||
|
|
||||||
#define T_FATE_BUILD 5 /* >= BUILD rebuilds target */
|
#define T_FATE_BUILD 5 /* >= BUILD rebuilds target */
|
||||||
#define T_FATE_TOUCHED 5 /* manually touched with -t */
|
#define T_FATE_TOUCHED 5 /* manually touched with -t */
|
||||||
#define T_FATE_REBUILD 6
|
#define T_FATE_REBUILD 6
|
||||||
#define T_FATE_MISSING 7 /* is missing, needs updating */
|
#define T_FATE_MISSING 7 /* is missing, needs updating */
|
||||||
#define T_FATE_NEEDTMP 8 /* missing temp that must be rebuild */
|
#define T_FATE_NEEDTMP 8 /* missing temp that must be rebuild */
|
||||||
#define T_FATE_OUTDATED 9 /* is out of date, needs updating */
|
#define T_FATE_OUTDATED 9 /* is out of date, needs updating */
|
||||||
@ -213,7 +205,7 @@ struct _target
|
|||||||
#define T_FATE_CANTFIND 11 /* no rules to make missing target */
|
#define T_FATE_CANTFIND 11 /* no rules to make missing target */
|
||||||
#define T_FATE_CANTMAKE 12 /* can not find dependencies */
|
#define T_FATE_CANTMAKE 12 /* can not find dependencies */
|
||||||
|
|
||||||
char progress; /* tracks make1() progress */
|
char progress; /* tracks make1() progress */
|
||||||
|
|
||||||
#define T_MAKE_INIT 0 /* make1(target) not yet called */
|
#define T_MAKE_INIT 0 /* make1(target) not yet called */
|
||||||
#define T_MAKE_ONSTACK 1 /* make1(target) on stack */
|
#define T_MAKE_ONSTACK 1 /* make1(target) on stack */
|
||||||
@ -222,20 +214,20 @@ struct _target
|
|||||||
#define T_MAKE_DONE 4 /* make1(target) done */
|
#define T_MAKE_DONE 4 /* make1(target) done */
|
||||||
|
|
||||||
#ifdef OPT_SEMAPHORE
|
#ifdef OPT_SEMAPHORE
|
||||||
#define T_MAKE_SEMAPHORE 5 /* Special target type for semaphores */
|
#define T_MAKE_SEMAPHORE 5 /* Special target type for semaphores */
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef OPT_SEMAPHORE
|
#ifdef OPT_SEMAPHORE
|
||||||
TARGET * semaphore; /* used in serialization */
|
TARGET * semaphore; /* used in serialization */
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
char status; /* exec_cmd() result */
|
char status; /* exec_cmd() result */
|
||||||
|
|
||||||
int asynccnt; /* child deps outstanding */
|
int asynccnt; /* child deps outstanding */
|
||||||
TARGETS * parents; /* used by make1() for completion */
|
TARGETS * parents; /* used by make1() for completion */
|
||||||
char * cmds; /* type-punned command list */
|
char * cmds; /* type-punned command list */
|
||||||
|
|
||||||
char * failed;
|
char * failed;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
@ -29,15 +29,14 @@
|
|||||||
|
|
||||||
#define YYSTYPE YYSYMBOL
|
#define YYSTYPE YYSYMBOL
|
||||||
|
|
||||||
typedef struct _YYSTYPE
|
typedef struct _YYSTYPE {
|
||||||
{
|
int type;
|
||||||
int type;
|
char * string;
|
||||||
char * string;
|
PARSE * parse;
|
||||||
PARSE * parse;
|
LIST * list;
|
||||||
LIST * list;
|
int number;
|
||||||
int number;
|
char * file;
|
||||||
char * file;
|
int line;
|
||||||
int line;
|
|
||||||
} YYSTYPE;
|
} YYSTYPE;
|
||||||
|
|
||||||
extern YYSTYPE yylval;
|
extern YYSTYPE yylval;
|
||||||
|
@ -7,14 +7,13 @@
|
|||||||
|
|
||||||
# include <stddef.h>
|
# include <stddef.h>
|
||||||
|
|
||||||
typedef struct string
|
typedef struct string {
|
||||||
{
|
char* value;
|
||||||
char* value;
|
unsigned long size;
|
||||||
unsigned long size;
|
unsigned long capacity;
|
||||||
unsigned long capacity;
|
char opt[32];
|
||||||
char opt[32];
|
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
char magic[4];
|
char magic[4];
|
||||||
#endif
|
#endif
|
||||||
} string;
|
} string;
|
||||||
|
|
||||||
|
@ -50,10 +50,10 @@ Data::~Data() {
|
|||||||
//ADDED BY TS
|
//ADDED BY TS
|
||||||
void Data::remove_duplicates() {
|
void Data::remove_duplicates() {
|
||||||
|
|
||||||
uint nSentences = featdata->size();
|
size_t nSentences = featdata->size();
|
||||||
assert(scoredata->size() == nSentences);
|
assert(scoredata->size() == nSentences);
|
||||||
|
|
||||||
for (uint s=0; s < nSentences; s++) {
|
for (size_t s=0; s < nSentences; s++) {
|
||||||
|
|
||||||
FeatureArray& feat_array = featdata->get(s);
|
FeatureArray& feat_array = featdata->get(s);
|
||||||
ScoreArray& score_array = scoredata->get(s);
|
ScoreArray& score_array = scoredata->get(s);
|
||||||
@ -61,29 +61,29 @@ void Data::remove_duplicates() {
|
|||||||
assert(feat_array.size() == score_array.size());
|
assert(feat_array.size() == score_array.size());
|
||||||
|
|
||||||
//serves as a hash-map:
|
//serves as a hash-map:
|
||||||
std::map<double, std::vector<uint> > lookup;
|
std::map<double, std::vector<size_t> > lookup;
|
||||||
|
|
||||||
uint end_pos = feat_array.size() - 1;
|
size_t end_pos = feat_array.size() - 1;
|
||||||
|
|
||||||
uint nRemoved = 0;
|
size_t nRemoved = 0;
|
||||||
for (uint k=0; k <= end_pos; k++) {
|
for (size_t k=0; k <= end_pos; k++) {
|
||||||
|
|
||||||
const FeatureStats& cur_feats = feat_array.get(k);
|
const FeatureStats& cur_feats = feat_array.get(k);
|
||||||
|
|
||||||
double sum = 0.0;
|
double sum = 0.0;
|
||||||
for (uint l=0; l < cur_feats.size(); l++)
|
for (size_t l=0; l < cur_feats.size(); l++)
|
||||||
sum += cur_feats.get(l);
|
sum += cur_feats.get(l);
|
||||||
|
|
||||||
if (lookup.find(sum) != lookup.end()) {
|
if (lookup.find(sum) != lookup.end()) {
|
||||||
|
|
||||||
//std::cerr << "hit" << std::endl;
|
//std::cerr << "hit" << std::endl;
|
||||||
|
|
||||||
std::vector<uint>& cur_list = lookup[sum];
|
std::vector<size_t>& cur_list = lookup[sum];
|
||||||
|
|
||||||
uint l=0;
|
size_t l=0;
|
||||||
for (l=0; l < cur_list.size(); l++) {
|
for (l=0; l < cur_list.size(); l++) {
|
||||||
|
|
||||||
uint j=cur_list[l];
|
size_t j=cur_list[l];
|
||||||
|
|
||||||
if (cur_feats == feat_array.get(j)
|
if (cur_feats == feat_array.get(j)
|
||||||
&& score_array.get(k) == score_array.get(j)) {
|
&& score_array.get(k) == score_array.get(j)) {
|
||||||
|
@ -129,7 +129,8 @@ IOWrapper::~IOWrapper()
|
|||||||
delete m_singleBestOutputCollector;
|
delete m_singleBestOutputCollector;
|
||||||
}
|
}
|
||||||
|
|
||||||
void IOWrapper::ResetTranslationId() {
|
void IOWrapper::ResetTranslationId()
|
||||||
|
{
|
||||||
m_translationId = StaticData::Instance().GetStartTranslationId();
|
m_translationId = StaticData::Instance().GetStartTranslationId();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -369,18 +370,18 @@ void IOWrapper::OutputNBestList(const ChartTrellisPathList &nBestList, const Cha
|
|||||||
if (pds.size() > 0) {
|
if (pds.size() > 0) {
|
||||||
|
|
||||||
for( size_t i=0; i<pds.size(); i++ ) {
|
for( size_t i=0; i<pds.size(); i++ ) {
|
||||||
size_t pd_numinputscore = pds[i]->GetNumInputScores();
|
size_t pd_numinputscore = pds[i]->GetNumInputScores();
|
||||||
vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer( pds[i] );
|
vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer( pds[i] );
|
||||||
for (size_t j = 0; j<scores.size(); ++j){
|
for (size_t j = 0; j<scores.size(); ++j) {
|
||||||
|
|
||||||
if (labeledOutput && (i == 0) ){
|
if (labeledOutput && (i == 0) ) {
|
||||||
if ((j == 0) || (j == pd_numinputscore)){
|
if ((j == 0) || (j == pd_numinputscore)) {
|
||||||
lastName = pds[i]->GetScoreProducerWeightShortName(j);
|
lastName = pds[i]->GetScoreProducerWeightShortName(j);
|
||||||
out << " " << lastName << ":";
|
out << " " << lastName << ":";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
out << " " << scores[j];
|
out << " " << scores[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -394,18 +395,18 @@ void IOWrapper::OutputNBestList(const ChartTrellisPathList &nBestList, const Cha
|
|||||||
if (gds.size() > 0) {
|
if (gds.size() > 0) {
|
||||||
|
|
||||||
for( size_t i=0; i<gds.size(); i++ ) {
|
for( size_t i=0; i<gds.size(); i++ ) {
|
||||||
size_t pd_numinputscore = gds[i]->GetNumInputScores();
|
size_t pd_numinputscore = gds[i]->GetNumInputScores();
|
||||||
vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer( gds[i] );
|
vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer( gds[i] );
|
||||||
for (size_t j = 0; j<scores.size(); ++j){
|
for (size_t j = 0; j<scores.size(); ++j) {
|
||||||
|
|
||||||
if (labeledOutput && (i == 0) ){
|
if (labeledOutput && (i == 0) ) {
|
||||||
if ((j == 0) || (j == pd_numinputscore)){
|
if ((j == 0) || (j == pd_numinputscore)) {
|
||||||
lastName = gds[i]->GetScoreProducerWeightShortName(j);
|
lastName = gds[i]->GetScoreProducerWeightShortName(j);
|
||||||
out << " " << lastName << ":";
|
out << " " << lastName << ":";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
out << " " << scores[j];
|
out << " " << scores[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -210,13 +210,13 @@ void OutputAlignment(ostream &out, const AlignmentInfo &ai, size_t sourceOffset,
|
|||||||
{
|
{
|
||||||
typedef std::vector< const std::pair<size_t,size_t>* > AlignVec;
|
typedef std::vector< const std::pair<size_t,size_t>* > AlignVec;
|
||||||
AlignVec alignments = ai.GetSortedAlignments();
|
AlignVec alignments = ai.GetSortedAlignments();
|
||||||
|
|
||||||
AlignVec::const_iterator it;
|
AlignVec::const_iterator it;
|
||||||
for (it = alignments.begin(); it != alignments.end(); ++it) {
|
for (it = alignments.begin(); it != alignments.end(); ++it) {
|
||||||
const std::pair<size_t,size_t> &alignment = **it;
|
const std::pair<size_t,size_t> &alignment = **it;
|
||||||
out << alignment.first + sourceOffset << "-" << alignment.second + targetOffset << " ";
|
out << alignment.first + sourceOffset << "-" << alignment.second + targetOffset << " ";
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void OutputAlignment(ostream &out, const vector<const Hypothesis *> &edges)
|
void OutputAlignment(ostream &out, const vector<const Hypothesis *> &edges)
|
||||||
@ -227,7 +227,7 @@ void OutputAlignment(ostream &out, const vector<const Hypothesis *> &edges)
|
|||||||
const Hypothesis &edge = *edges[currEdge];
|
const Hypothesis &edge = *edges[currEdge];
|
||||||
const TargetPhrase &tp = edge.GetCurrTargetPhrase();
|
const TargetPhrase &tp = edge.GetCurrTargetPhrase();
|
||||||
size_t sourceOffset = edge.GetCurrSourceWordsRange().GetStartPos();
|
size_t sourceOffset = edge.GetCurrSourceWordsRange().GetStartPos();
|
||||||
|
|
||||||
OutputAlignment(out, tp.GetAlignmentInfo(), sourceOffset, targetOffset);
|
OutputAlignment(out, tp.GetAlignmentInfo(), sourceOffset, targetOffset);
|
||||||
|
|
||||||
targetOffset += tp.GetSize();
|
targetOffset += tp.GetSize();
|
||||||
@ -239,7 +239,7 @@ void OutputAlignment(OutputCollector* collector, size_t lineNo , const vector<co
|
|||||||
{
|
{
|
||||||
ostringstream out;
|
ostringstream out;
|
||||||
OutputAlignment(out, edges);
|
OutputAlignment(out, edges);
|
||||||
|
|
||||||
collector->Write(lineNo,out.str());
|
collector->Write(lineNo,out.str());
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -412,18 +412,18 @@ void OutputNBest(std::ostream& out, const Moses::TrellisPathList &nBestList, con
|
|||||||
if (pds.size() > 0) {
|
if (pds.size() > 0) {
|
||||||
|
|
||||||
for( size_t i=0; i<pds.size(); i++ ) {
|
for( size_t i=0; i<pds.size(); i++ ) {
|
||||||
size_t pd_numinputscore = pds[i]->GetNumInputScores();
|
size_t pd_numinputscore = pds[i]->GetNumInputScores();
|
||||||
vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer( pds[i] );
|
vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer( pds[i] );
|
||||||
for (size_t j = 0; j<scores.size(); ++j){
|
for (size_t j = 0; j<scores.size(); ++j) {
|
||||||
|
|
||||||
if (labeledOutput && (i == 0) ){
|
if (labeledOutput && (i == 0) ) {
|
||||||
if ((j == 0) || (j == pd_numinputscore)){
|
if ((j == 0) || (j == pd_numinputscore)) {
|
||||||
lastName = pds[i]->GetScoreProducerWeightShortName(j);
|
lastName = pds[i]->GetScoreProducerWeightShortName(j);
|
||||||
out << " " << lastName << ":";
|
out << " " << lastName << ":";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
out << " " << scores[j];
|
out << " " << scores[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -432,18 +432,18 @@ void OutputNBest(std::ostream& out, const Moses::TrellisPathList &nBestList, con
|
|||||||
if (gds.size() > 0) {
|
if (gds.size() > 0) {
|
||||||
|
|
||||||
for( size_t i=0; i<gds.size(); i++ ) {
|
for( size_t i=0; i<gds.size(); i++ ) {
|
||||||
size_t pd_numinputscore = gds[i]->GetNumInputScores();
|
size_t pd_numinputscore = gds[i]->GetNumInputScores();
|
||||||
vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer( gds[i] );
|
vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer( gds[i] );
|
||||||
for (size_t j = 0; j<scores.size(); ++j){
|
for (size_t j = 0; j<scores.size(); ++j) {
|
||||||
|
|
||||||
if (labeledOutput && (i == 0) ){
|
if (labeledOutput && (i == 0) ) {
|
||||||
if ((j == 0) || (j == pd_numinputscore)){
|
if ((j == 0) || (j == pd_numinputscore)) {
|
||||||
lastName = gds[i]->GetScoreProducerWeightShortName(j);
|
lastName = gds[i]->GetScoreProducerWeightShortName(j);
|
||||||
out << " " << lastName << ":";
|
out << " " << lastName << ":";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
out << " " << scores[j];
|
out << " " << scores[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -477,7 +477,7 @@ void OutputNBest(std::ostream& out, const Moses::TrellisPathList &nBestList, con
|
|||||||
const int sourceOffset = sourceRange.GetStartPos();
|
const int sourceOffset = sourceRange.GetStartPos();
|
||||||
const int targetOffset = targetRange.GetStartPos();
|
const int targetOffset = targetRange.GetStartPos();
|
||||||
const AlignmentInfo &ai = edge.GetCurrTargetPhrase().GetAlignmentInfo();
|
const AlignmentInfo &ai = edge.GetCurrTargetPhrase().GetAlignmentInfo();
|
||||||
|
|
||||||
OutputAlignment(out, ai, sourceOffset, targetOffset);
|
OutputAlignment(out, ai, sourceOffset, targetOffset);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -83,7 +83,7 @@ public:
|
|||||||
m_detailedTranslationCollector(detailedTranslationCollector),
|
m_detailedTranslationCollector(detailedTranslationCollector),
|
||||||
m_alignmentInfoCollector(alignmentInfoCollector) {}
|
m_alignmentInfoCollector(alignmentInfoCollector) {}
|
||||||
|
|
||||||
/** Translate one sentence
|
/** Translate one sentence
|
||||||
* gets called by main function implemented at end of this source file */
|
* gets called by main function implemented at end of this source file */
|
||||||
void Run() {
|
void Run() {
|
||||||
|
|
||||||
@ -130,7 +130,7 @@ public:
|
|||||||
manager.SerializeSearchGraphPB(m_lineNumber, output);
|
manager.SerializeSearchGraphPB(m_lineNumber, output);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
// apply decision rule and output best translation(s)
|
// apply decision rule and output best translation(s)
|
||||||
if (m_outputCollector) {
|
if (m_outputCollector) {
|
||||||
@ -145,8 +145,7 @@ public:
|
|||||||
|
|
||||||
// MAP decoding: best hypothesis
|
// MAP decoding: best hypothesis
|
||||||
const Hypothesis* bestHypo = NULL;
|
const Hypothesis* bestHypo = NULL;
|
||||||
if (!staticData.UseMBR())
|
if (!staticData.UseMBR()) {
|
||||||
{
|
|
||||||
bestHypo = manager.GetBestHypothesis();
|
bestHypo = manager.GetBestHypothesis();
|
||||||
if (bestHypo) {
|
if (bestHypo) {
|
||||||
if (staticData.IsPathRecoveryEnabled()) {
|
if (staticData.IsPathRecoveryEnabled()) {
|
||||||
@ -165,11 +164,10 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
out << endl;
|
out << endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
// MBR decoding (n-best MBR, lattice MBR, consensus)
|
// MBR decoding (n-best MBR, lattice MBR, consensus)
|
||||||
else
|
else {
|
||||||
{
|
|
||||||
// we first need the n-best translations
|
// we first need the n-best translations
|
||||||
size_t nBestSize = staticData.GetMBRSize();
|
size_t nBestSize = staticData.GetMBRSize();
|
||||||
if (nBestSize <= 0) {
|
if (nBestSize <= 0) {
|
||||||
@ -205,7 +203,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
// consensus decoding
|
// consensus decoding
|
||||||
else if (staticData.UseConsensusDecoding()) {
|
else if (staticData.UseConsensusDecoding()) {
|
||||||
const TrellisPath &conBestHypo = doConsensusDecoding(manager,nBestList);
|
const TrellisPath &conBestHypo = doConsensusDecoding(manager,nBestList);
|
||||||
OutputBestHypo(conBestHypo, m_lineNumber,
|
OutputBestHypo(conBestHypo, m_lineNumber,
|
||||||
staticData.GetReportSegmentation(),
|
staticData.GetReportSegmentation(),
|
||||||
@ -214,8 +212,8 @@ public:
|
|||||||
IFVERBOSE(2) {
|
IFVERBOSE(2) {
|
||||||
PrintUserTime("finished Consensus decoding");
|
PrintUserTime("finished Consensus decoding");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// n-best MBR decoding
|
// n-best MBR decoding
|
||||||
else {
|
else {
|
||||||
const Moses::TrellisPath &mbrBestHypo = doMBR(nBestList);
|
const Moses::TrellisPath &mbrBestHypo = doMBR(nBestList);
|
||||||
@ -482,7 +480,7 @@ int main(int argc, char** argv)
|
|||||||
alignmentInfoCollector.get() );
|
alignmentInfoCollector.get() );
|
||||||
// execute task
|
// execute task
|
||||||
#ifdef WITH_THREADS
|
#ifdef WITH_THREADS
|
||||||
pool.Submit(task);
|
pool.Submit(task);
|
||||||
#else
|
#else
|
||||||
task->Run();
|
task->Run();
|
||||||
#endif
|
#endif
|
||||||
|
@ -57,7 +57,7 @@ void PrintTranslationAnalysis(const TranslationSystem* system, std::ostream &os,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool epsilon = false;
|
bool epsilon = false;
|
||||||
if (target == "") {
|
if (target == "") {
|
||||||
target="<EPSILON>";
|
target="<EPSILON>";
|
||||||
|
@ -1,17 +1,17 @@
|
|||||||
/***********************************************************************
|
/***********************************************************************
|
||||||
Moses - statistical machine translation system
|
Moses - statistical machine translation system
|
||||||
Copyright (C) 2006-2011 University of Edinburgh
|
Copyright (C) 2006-2011 University of Edinburgh
|
||||||
|
|
||||||
This library is free software; you can redistribute it and/or
|
This library is free software; you can redistribute it and/or
|
||||||
modify it under the terms of the GNU Lesser General Public
|
modify it under the terms of the GNU Lesser General Public
|
||||||
License as published by the Free Software Foundation; either
|
License as published by the Free Software Foundation; either
|
||||||
version 2.1 of the License, or (at your option) any later version.
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
This library is distributed in the hope that it will be useful,
|
This library is distributed in the hope that it will be useful,
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
Lesser General Public License for more details.
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
You should have received a copy of the GNU Lesser General Public
|
You should have received a copy of the GNU Lesser General Public
|
||||||
License along with this library; if not, write to the Free Software
|
License along with this library; if not, write to the Free Software
|
||||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
@ -42,10 +42,11 @@ void AlignmentInfo::BuildNonTermIndexMap()
|
|||||||
for (p = begin(); p != end(); ++p) {
|
for (p = begin(); p != end(); ++p) {
|
||||||
m_nonTermIndexMap[p->second] = i++;
|
m_nonTermIndexMap[p->second] = i++;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool compare_target(const std::pair<size_t,size_t> *a, const std::pair<size_t,size_t> *b) {
|
bool compare_target(const std::pair<size_t,size_t> *a, const std::pair<size_t,size_t> *b)
|
||||||
|
{
|
||||||
if(a->second < b->second) return true;
|
if(a->second < b->second) return true;
|
||||||
if(a->second == b->second) return (a->first < b->first);
|
if(a->second == b->second) return (a->first < b->first);
|
||||||
return false;
|
return false;
|
||||||
@ -55,34 +56,32 @@ bool compare_target(const std::pair<size_t,size_t> *a, const std::pair<size_t,si
|
|||||||
std::vector< const std::pair<size_t,size_t>* > AlignmentInfo::GetSortedAlignments() const
|
std::vector< const std::pair<size_t,size_t>* > AlignmentInfo::GetSortedAlignments() const
|
||||||
{
|
{
|
||||||
std::vector< const std::pair<size_t,size_t>* > ret;
|
std::vector< const std::pair<size_t,size_t>* > ret;
|
||||||
|
|
||||||
CollType::const_iterator iter;
|
CollType::const_iterator iter;
|
||||||
for (iter = m_collection.begin(); iter != m_collection.end(); ++iter)
|
for (iter = m_collection.begin(); iter != m_collection.end(); ++iter) {
|
||||||
{
|
|
||||||
const std::pair<size_t,size_t> &alignPair = *iter;
|
const std::pair<size_t,size_t> &alignPair = *iter;
|
||||||
ret.push_back(&alignPair);
|
ret.push_back(&alignPair);
|
||||||
}
|
}
|
||||||
|
|
||||||
const StaticData &staticData = StaticData::Instance();
|
const StaticData &staticData = StaticData::Instance();
|
||||||
WordAlignmentSort wordAlignmentSort = staticData.GetWordAlignmentSort();
|
WordAlignmentSort wordAlignmentSort = staticData.GetWordAlignmentSort();
|
||||||
|
|
||||||
switch (wordAlignmentSort)
|
switch (wordAlignmentSort) {
|
||||||
{
|
case NoSort:
|
||||||
case NoSort:
|
break;
|
||||||
break;
|
|
||||||
|
case TargetOrder:
|
||||||
case TargetOrder:
|
std::sort(ret.begin(), ret.end(), compare_target);
|
||||||
std::sort(ret.begin(), ret.end(), compare_target);
|
break;
|
||||||
break;
|
|
||||||
|
default:
|
||||||
default:
|
CHECK(false);
|
||||||
CHECK(false);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
std::ostream& operator<<(std::ostream &out, const AlignmentInfo &alignmentInfo)
|
std::ostream& operator<<(std::ostream &out, const AlignmentInfo &alignmentInfo)
|
||||||
{
|
{
|
||||||
AlignmentInfo::const_iterator iter;
|
AlignmentInfo::const_iterator iter;
|
||||||
|
@ -1,17 +1,17 @@
|
|||||||
/***********************************************************************
|
/***********************************************************************
|
||||||
Moses - statistical machine translation system
|
Moses - statistical machine translation system
|
||||||
Copyright (C) 2006-2011 University of Edinburgh
|
Copyright (C) 2006-2011 University of Edinburgh
|
||||||
|
|
||||||
This library is free software; you can redistribute it and/or
|
This library is free software; you can redistribute it and/or
|
||||||
modify it under the terms of the GNU Lesser General Public
|
modify it under the terms of the GNU Lesser General Public
|
||||||
License as published by the Free Software Foundation; either
|
License as published by the Free Software Foundation; either
|
||||||
version 2.1 of the License, or (at your option) any later version.
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
This library is distributed in the hope that it will be useful,
|
This library is distributed in the hope that it will be useful,
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
Lesser General Public License for more details.
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
You should have received a copy of the GNU Lesser General Public
|
You should have received a copy of the GNU Lesser General Public
|
||||||
License along with this library; if not, write to the Free Software
|
License along with this library; if not, write to the Free Software
|
||||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
@ -37,12 +37,16 @@ class AlignmentInfo
|
|||||||
friend struct AlignmentInfoOrderer;
|
friend struct AlignmentInfoOrderer;
|
||||||
friend class AlignmentInfoCollection;
|
friend class AlignmentInfoCollection;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
typedef std::vector<size_t> NonTermIndexMap;
|
typedef std::vector<size_t> NonTermIndexMap;
|
||||||
typedef CollType::const_iterator const_iterator;
|
typedef CollType::const_iterator const_iterator;
|
||||||
|
|
||||||
const_iterator begin() const { return m_collection.begin(); }
|
const_iterator begin() const {
|
||||||
const_iterator end() const { return m_collection.end(); }
|
return m_collection.begin();
|
||||||
|
}
|
||||||
|
const_iterator end() const {
|
||||||
|
return m_collection.end();
|
||||||
|
}
|
||||||
|
|
||||||
// Provides a map from target-side to source-side non-terminal indices.
|
// Provides a map from target-side to source-side non-terminal indices.
|
||||||
// The target-side index should be the rule symbol index (counting terminals).
|
// The target-side index should be the rule symbol index (counting terminals).
|
||||||
@ -52,12 +56,11 @@ class AlignmentInfo
|
|||||||
}
|
}
|
||||||
|
|
||||||
std::vector< const std::pair<size_t,size_t>* > GetSortedAlignments() const;
|
std::vector< const std::pair<size_t,size_t>* > GetSortedAlignments() const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// AlignmentInfo objects should only be created by an AlignmentInfoCollection
|
// AlignmentInfo objects should only be created by an AlignmentInfoCollection
|
||||||
explicit AlignmentInfo(const std::set<std::pair<size_t,size_t> > &pairs)
|
explicit AlignmentInfo(const std::set<std::pair<size_t,size_t> > &pairs)
|
||||||
: m_collection(pairs)
|
: m_collection(pairs) {
|
||||||
{
|
|
||||||
BuildNonTermIndexMap();
|
BuildNonTermIndexMap();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -69,8 +72,7 @@ class AlignmentInfo
|
|||||||
|
|
||||||
// Define an arbitrary strict weak ordering between AlignmentInfo objects
|
// Define an arbitrary strict weak ordering between AlignmentInfo objects
|
||||||
// for use by AlignmentInfoCollection.
|
// for use by AlignmentInfoCollection.
|
||||||
struct AlignmentInfoOrderer
|
struct AlignmentInfoOrderer {
|
||||||
{
|
|
||||||
bool operator()(const AlignmentInfo &a, const AlignmentInfo &b) const {
|
bool operator()(const AlignmentInfo &a, const AlignmentInfo &b) const {
|
||||||
return a.m_collection < b.m_collection;
|
return a.m_collection < b.m_collection;
|
||||||
}
|
}
|
||||||
|
@ -1,17 +1,17 @@
|
|||||||
/***********************************************************************
|
/***********************************************************************
|
||||||
Moses - statistical machine translation system
|
Moses - statistical machine translation system
|
||||||
Copyright (C) 2006-2011 University of Edinburgh
|
Copyright (C) 2006-2011 University of Edinburgh
|
||||||
|
|
||||||
This library is free software; you can redistribute it and/or
|
This library is free software; you can redistribute it and/or
|
||||||
modify it under the terms of the GNU Lesser General Public
|
modify it under the terms of the GNU Lesser General Public
|
||||||
License as published by the Free Software Foundation; either
|
License as published by the Free Software Foundation; either
|
||||||
version 2.1 of the License, or (at your option) any later version.
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
This library is distributed in the hope that it will be useful,
|
This library is distributed in the hope that it will be useful,
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
Lesser General Public License for more details.
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
You should have received a copy of the GNU Lesser General Public
|
You should have received a copy of the GNU Lesser General Public
|
||||||
License along with this library; if not, write to the Free Software
|
License along with this library; if not, write to the Free Software
|
||||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
@ -36,7 +36,7 @@ const AlignmentInfo &AlignmentInfoCollection::GetEmptyAlignmentInfo() const
|
|||||||
}
|
}
|
||||||
|
|
||||||
const AlignmentInfo *AlignmentInfoCollection::Add(
|
const AlignmentInfo *AlignmentInfoCollection::Add(
|
||||||
const std::set<std::pair<size_t,size_t> > &pairs)
|
const std::set<std::pair<size_t,size_t> > &pairs)
|
||||||
{
|
{
|
||||||
std::pair<AlignmentInfoSet::iterator, bool> ret =
|
std::pair<AlignmentInfoSet::iterator, bool> ret =
|
||||||
m_collection.insert(AlignmentInfo(pairs));
|
m_collection.insert(AlignmentInfo(pairs));
|
||||||
|
@ -1,17 +1,17 @@
|
|||||||
/***********************************************************************
|
/***********************************************************************
|
||||||
Moses - statistical machine translation system
|
Moses - statistical machine translation system
|
||||||
Copyright (C) 2006-2011 University of Edinburgh
|
Copyright (C) 2006-2011 University of Edinburgh
|
||||||
|
|
||||||
This library is free software; you can redistribute it and/or
|
This library is free software; you can redistribute it and/or
|
||||||
modify it under the terms of the GNU Lesser General Public
|
modify it under the terms of the GNU Lesser General Public
|
||||||
License as published by the Free Software Foundation; either
|
License as published by the Free Software Foundation; either
|
||||||
version 2.1 of the License, or (at your option) any later version.
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
This library is distributed in the hope that it will be useful,
|
This library is distributed in the hope that it will be useful,
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
Lesser General Public License for more details.
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
You should have received a copy of the GNU Lesser General Public
|
You should have received a copy of the GNU Lesser General Public
|
||||||
License along with this library; if not, write to the Free Software
|
License along with this library; if not, write to the Free Software
|
||||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
@ -29,8 +29,10 @@ namespace Moses
|
|||||||
// Singleton collection of all AlignmentInfo objects.
|
// Singleton collection of all AlignmentInfo objects.
|
||||||
class AlignmentInfoCollection
|
class AlignmentInfoCollection
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
static AlignmentInfoCollection &Instance() { return s_instance; }
|
static AlignmentInfoCollection &Instance() {
|
||||||
|
return s_instance;
|
||||||
|
}
|
||||||
|
|
||||||
// Returns a pointer to an AlignmentInfo object with the same source-target
|
// Returns a pointer to an AlignmentInfo object with the same source-target
|
||||||
// alignment pairs as given in the argument. If the collection already
|
// alignment pairs as given in the argument. If the collection already
|
||||||
@ -41,7 +43,7 @@ class AlignmentInfoCollection
|
|||||||
// Returns a pointer to an empty AlignmentInfo object.
|
// Returns a pointer to an empty AlignmentInfo object.
|
||||||
const AlignmentInfo &GetEmptyAlignmentInfo() const;
|
const AlignmentInfo &GetEmptyAlignmentInfo() const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
typedef std::set<AlignmentInfo, AlignmentInfoOrderer> AlignmentInfoSet;
|
typedef std::set<AlignmentInfo, AlignmentInfoOrderer> AlignmentInfoSet;
|
||||||
|
|
||||||
// Only a single static variable should be created.
|
// Only a single static variable should be created.
|
||||||
|
@ -7,455 +7,454 @@
|
|||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
namespace Moses {
|
namespace Moses
|
||||||
|
{
|
||||||
|
|
||||||
BilingualDynSuffixArray::BilingualDynSuffixArray():
|
BilingualDynSuffixArray::BilingualDynSuffixArray():
|
||||||
m_maxPhraseLength(StaticData::Instance().GetMaxPhraseLength()),
|
m_maxPhraseLength(StaticData::Instance().GetMaxPhraseLength()),
|
||||||
m_maxSampleSize(20)
|
m_maxSampleSize(20)
|
||||||
{
|
{
|
||||||
m_srcSA = 0;
|
m_srcSA = 0;
|
||||||
m_trgSA = 0;
|
m_trgSA = 0;
|
||||||
m_srcCorpus = new std::vector<wordID_t>();
|
m_srcCorpus = new std::vector<wordID_t>();
|
||||||
m_trgCorpus = new std::vector<wordID_t>();
|
m_trgCorpus = new std::vector<wordID_t>();
|
||||||
m_srcVocab = new Vocab(false);
|
m_srcVocab = new Vocab(false);
|
||||||
m_trgVocab = new Vocab(false);
|
m_trgVocab = new Vocab(false);
|
||||||
m_scoreCmp = 0;
|
m_scoreCmp = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
BilingualDynSuffixArray::~BilingualDynSuffixArray()
|
BilingualDynSuffixArray::~BilingualDynSuffixArray()
|
||||||
{
|
{
|
||||||
if(m_srcSA) delete m_srcSA;
|
if(m_srcSA) delete m_srcSA;
|
||||||
if(m_trgSA) delete m_trgSA;
|
if(m_trgSA) delete m_trgSA;
|
||||||
if(m_srcVocab) delete m_srcVocab;
|
if(m_srcVocab) delete m_srcVocab;
|
||||||
if(m_trgVocab) delete m_trgVocab;
|
if(m_trgVocab) delete m_trgVocab;
|
||||||
if(m_srcCorpus) delete m_srcCorpus;
|
if(m_srcCorpus) delete m_srcCorpus;
|
||||||
if(m_trgCorpus) delete m_trgCorpus;
|
if(m_trgCorpus) delete m_trgCorpus;
|
||||||
if(m_scoreCmp) delete m_scoreCmp;
|
if(m_scoreCmp) delete m_scoreCmp;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool BilingualDynSuffixArray::Load(
|
bool BilingualDynSuffixArray::Load(
|
||||||
const std::vector<FactorType>& inputFactors,
|
const std::vector<FactorType>& inputFactors,
|
||||||
const std::vector<FactorType>& outputFactors,
|
const std::vector<FactorType>& outputFactors,
|
||||||
std::string source, std::string target, std::string alignments,
|
std::string source, std::string target, std::string alignments,
|
||||||
const std::vector<float> &weight)
|
const std::vector<float> &weight)
|
||||||
{
|
{
|
||||||
m_inputFactors = inputFactors;
|
m_inputFactors = inputFactors;
|
||||||
m_outputFactors = outputFactors;
|
m_outputFactors = outputFactors;
|
||||||
|
|
||||||
m_scoreCmp = new ScoresComp(weight);
|
m_scoreCmp = new ScoresComp(weight);
|
||||||
InputFileStream sourceStrme(source);
|
InputFileStream sourceStrme(source);
|
||||||
InputFileStream targetStrme(target);
|
InputFileStream targetStrme(target);
|
||||||
cerr << "Loading source corpus...\n";
|
cerr << "Loading source corpus...\n";
|
||||||
LoadCorpus(sourceStrme, m_inputFactors, *m_srcCorpus, m_srcSntBreaks, m_srcVocab);
|
LoadCorpus(sourceStrme, m_inputFactors, *m_srcCorpus, m_srcSntBreaks, m_srcVocab);
|
||||||
cerr << "Loading target corpus...\n";
|
cerr << "Loading target corpus...\n";
|
||||||
LoadCorpus(targetStrme, m_outputFactors,*m_trgCorpus, m_trgSntBreaks, m_trgVocab);
|
LoadCorpus(targetStrme, m_outputFactors,*m_trgCorpus, m_trgSntBreaks, m_trgVocab);
|
||||||
CHECK(m_srcSntBreaks.size() == m_trgSntBreaks.size());
|
CHECK(m_srcSntBreaks.size() == m_trgSntBreaks.size());
|
||||||
|
|
||||||
// build suffix arrays and auxilliary arrays
|
// build suffix arrays and auxilliary arrays
|
||||||
cerr << "Building Source Suffix Array...\n";
|
cerr << "Building Source Suffix Array...\n";
|
||||||
m_srcSA = new DynSuffixArray(m_srcCorpus);
|
m_srcSA = new DynSuffixArray(m_srcCorpus);
|
||||||
if(!m_srcSA) return false;
|
if(!m_srcSA) return false;
|
||||||
cerr << "Building Target Suffix Array...\n";
|
cerr << "Building Target Suffix Array...\n";
|
||||||
//m_trgSA = new DynSuffixArray(m_trgCorpus);
|
//m_trgSA = new DynSuffixArray(m_trgCorpus);
|
||||||
//if(!m_trgSA) return false;
|
//if(!m_trgSA) return false;
|
||||||
cerr << "\t(Skipped. Not used)\n";
|
cerr << "\t(Skipped. Not used)\n";
|
||||||
|
|
||||||
InputFileStream alignStrme(alignments);
|
InputFileStream alignStrme(alignments);
|
||||||
cerr << "Loading Alignment File...\n";
|
cerr << "Loading Alignment File...\n";
|
||||||
LoadRawAlignments(alignStrme);
|
LoadRawAlignments(alignStrme);
|
||||||
//LoadAlignments(alignStrme);
|
//LoadAlignments(alignStrme);
|
||||||
cerr << "Building frequent word cache...\n";
|
cerr << "Building frequent word cache...\n";
|
||||||
CacheFreqWords();
|
CacheFreqWords();
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
int BilingualDynSuffixArray::LoadRawAlignments(InputFileStream& align)
|
int BilingualDynSuffixArray::LoadRawAlignments(InputFileStream& align)
|
||||||
{
|
{
|
||||||
// stores the alignments in the raw file format
|
// stores the alignments in the raw file format
|
||||||
std::string line;
|
std::string line;
|
||||||
std::vector<int> vtmp;
|
std::vector<int> vtmp;
|
||||||
while(getline(align, line)) {
|
while(getline(align, line)) {
|
||||||
Utils::splitToInt(line, vtmp, "- ");
|
Utils::splitToInt(line, vtmp, "- ");
|
||||||
CHECK(vtmp.size() % 2 == 0);
|
CHECK(vtmp.size() % 2 == 0);
|
||||||
std::vector<short> vAlgn; // store as short ints for memory
|
std::vector<short> vAlgn; // store as short ints for memory
|
||||||
for (std::vector<int>::const_iterator itr = vtmp.begin();
|
for (std::vector<int>::const_iterator itr = vtmp.begin();
|
||||||
itr != vtmp.end(); ++itr) {
|
itr != vtmp.end(); ++itr) {
|
||||||
vAlgn.push_back(short(*itr));
|
vAlgn.push_back(short(*itr));
|
||||||
}
|
}
|
||||||
m_rawAlignments.push_back(vAlgn);
|
m_rawAlignments.push_back(vAlgn);
|
||||||
}
|
}
|
||||||
return m_rawAlignments.size();
|
return m_rawAlignments.size();
|
||||||
}
|
}
|
||||||
int BilingualDynSuffixArray::LoadRawAlignments(string& align) {
|
int BilingualDynSuffixArray::LoadRawAlignments(string& align)
|
||||||
// stores the alignments in the raw file format
|
{
|
||||||
|
// stores the alignments in the raw file format
|
||||||
vector<int> vtmp;
|
vector<int> vtmp;
|
||||||
Utils::splitToInt(align, vtmp, "- ");
|
Utils::splitToInt(align, vtmp, "- ");
|
||||||
CHECK(vtmp.size() % 2 == 0);
|
CHECK(vtmp.size() % 2 == 0);
|
||||||
vector<short> vAlgn; // store as short ints for memory
|
vector<short> vAlgn; // store as short ints for memory
|
||||||
for (std::vector<int>::const_iterator itr = vtmp.begin();
|
for (std::vector<int>::const_iterator itr = vtmp.begin();
|
||||||
itr != vtmp.end(); ++itr) {
|
itr != vtmp.end(); ++itr) {
|
||||||
vAlgn.push_back(short(*itr));
|
vAlgn.push_back(short(*itr));
|
||||||
}
|
}
|
||||||
m_rawAlignments.push_back(vAlgn);
|
m_rawAlignments.push_back(vAlgn);
|
||||||
return m_rawAlignments.size();
|
return m_rawAlignments.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
int BilingualDynSuffixArray::LoadAlignments(InputFileStream& align)
|
int BilingualDynSuffixArray::LoadAlignments(InputFileStream& align)
|
||||||
{
|
{
|
||||||
std::string line;
|
std::string line;
|
||||||
std::vector<int> vtmp;
|
std::vector<int> vtmp;
|
||||||
int sntIndex(0);
|
int sntIndex(0);
|
||||||
|
|
||||||
while(getline(align, line)) {
|
|
||||||
Utils::splitToInt(line, vtmp, "- ");
|
|
||||||
CHECK(vtmp.size() % 2 == 0);
|
|
||||||
|
|
||||||
int sourceSize = GetSourceSentenceSize(sntIndex);
|
|
||||||
int targetSize = GetTargetSentenceSize(sntIndex);
|
|
||||||
|
|
||||||
SentenceAlignment curSnt(sntIndex, sourceSize, targetSize); // initialize empty sentence
|
while(getline(align, line)) {
|
||||||
for(int i=0; i < (int)vtmp.size(); i+=2) {
|
Utils::splitToInt(line, vtmp, "- ");
|
||||||
int sourcePos = vtmp[i];
|
CHECK(vtmp.size() % 2 == 0);
|
||||||
int targetPos = vtmp[i+1];
|
|
||||||
CHECK(sourcePos < sourceSize);
|
int sourceSize = GetSourceSentenceSize(sntIndex);
|
||||||
CHECK(targetPos < targetSize);
|
int targetSize = GetTargetSentenceSize(sntIndex);
|
||||||
|
|
||||||
curSnt.alignedList[sourcePos].push_back(targetPos); // list of target nodes for each source word
|
SentenceAlignment curSnt(sntIndex, sourceSize, targetSize); // initialize empty sentence
|
||||||
curSnt.numberAligned[targetPos]++; // cnt of how many source words connect to this target word
|
for(int i=0; i < (int)vtmp.size(); i+=2) {
|
||||||
}
|
int sourcePos = vtmp[i];
|
||||||
curSnt.srcSnt = m_srcCorpus + sntIndex; // point source and target sentence
|
int targetPos = vtmp[i+1];
|
||||||
curSnt.trgSnt = m_trgCorpus + sntIndex;
|
CHECK(sourcePos < sourceSize);
|
||||||
m_alignments.push_back(curSnt);
|
CHECK(targetPos < targetSize);
|
||||||
|
|
||||||
sntIndex++;
|
curSnt.alignedList[sourcePos].push_back(targetPos); // list of target nodes for each source word
|
||||||
}
|
curSnt.numberAligned[targetPos]++; // cnt of how many source words connect to this target word
|
||||||
return m_alignments.size();
|
}
|
||||||
|
curSnt.srcSnt = m_srcCorpus + sntIndex; // point source and target sentence
|
||||||
|
curSnt.trgSnt = m_trgCorpus + sntIndex;
|
||||||
|
m_alignments.push_back(curSnt);
|
||||||
|
|
||||||
|
sntIndex++;
|
||||||
|
}
|
||||||
|
return m_alignments.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
SentenceAlignment BilingualDynSuffixArray::GetSentenceAlignment(const int sntIndex, bool trg2Src) const
|
SentenceAlignment BilingualDynSuffixArray::GetSentenceAlignment(const int sntIndex, bool trg2Src) const
|
||||||
{
|
{
|
||||||
// retrieves the alignments in the format used by SentenceAlignment.Extract()
|
// retrieves the alignments in the format used by SentenceAlignment.Extract()
|
||||||
int sntGiven = trg2Src ? GetTargetSentenceSize(sntIndex) : GetSourceSentenceSize(sntIndex);
|
int sntGiven = trg2Src ? GetTargetSentenceSize(sntIndex) : GetSourceSentenceSize(sntIndex);
|
||||||
int sntExtract = trg2Src ? GetSourceSentenceSize(sntIndex) : GetTargetSentenceSize(sntIndex);
|
int sntExtract = trg2Src ? GetSourceSentenceSize(sntIndex) : GetTargetSentenceSize(sntIndex);
|
||||||
std::vector<short> alignment = m_rawAlignments.at(sntIndex);
|
std::vector<short> alignment = m_rawAlignments.at(sntIndex);
|
||||||
SentenceAlignment curSnt(sntIndex, sntGiven, sntExtract); // initialize empty sentence
|
SentenceAlignment curSnt(sntIndex, sntGiven, sntExtract); // initialize empty sentence
|
||||||
for(size_t i=0; i < alignment.size(); i+=2) {
|
for(size_t i=0; i < alignment.size(); i+=2) {
|
||||||
int sourcePos = alignment[i];
|
int sourcePos = alignment[i];
|
||||||
int targetPos = alignment[i+1];
|
int targetPos = alignment[i+1];
|
||||||
if(trg2Src) {
|
if(trg2Src) {
|
||||||
curSnt.alignedList[targetPos].push_back(sourcePos); // list of target nodes for each source word
|
curSnt.alignedList[targetPos].push_back(sourcePos); // list of target nodes for each source word
|
||||||
curSnt.numberAligned[sourcePos]++; // cnt of how many source words connect to this target word
|
curSnt.numberAligned[sourcePos]++; // cnt of how many source words connect to this target word
|
||||||
}
|
} else {
|
||||||
else {
|
curSnt.alignedList[sourcePos].push_back(targetPos); // list of target nodes for each source word
|
||||||
curSnt.alignedList[sourcePos].push_back(targetPos); // list of target nodes for each source word
|
curSnt.numberAligned[targetPos]++; // cnt of how many source words connect to this target word
|
||||||
curSnt.numberAligned[targetPos]++; // cnt of how many source words connect to this target word
|
}
|
||||||
}
|
}
|
||||||
}
|
curSnt.srcSnt = m_srcCorpus + sntIndex; // point source and target sentence
|
||||||
curSnt.srcSnt = m_srcCorpus + sntIndex; // point source and target sentence
|
curSnt.trgSnt = m_trgCorpus + sntIndex;
|
||||||
curSnt.trgSnt = m_trgCorpus + sntIndex;
|
|
||||||
|
return curSnt;
|
||||||
return curSnt;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool BilingualDynSuffixArray::ExtractPhrases(const int& sntIndex, const int& wordIndex,
|
bool BilingualDynSuffixArray::ExtractPhrases(const int& sntIndex, const int& wordIndex,
|
||||||
const int& sourceSize, std::vector<PhrasePair*>& phrasePairs, bool trg2Src) const
|
const int& sourceSize, std::vector<PhrasePair*>& phrasePairs, bool trg2Src) const
|
||||||
{
|
{
|
||||||
/* ExtractPhrases() can extract the matching phrases for both directions by using the trg2Src
|
/* ExtractPhrases() can extract the matching phrases for both directions by using the trg2Src
|
||||||
* parameter */
|
* parameter */
|
||||||
SentenceAlignment curSnt = GetSentenceAlignment(sntIndex, trg2Src);
|
SentenceAlignment curSnt = GetSentenceAlignment(sntIndex, trg2Src);
|
||||||
// get span of phrase in source sentence
|
// get span of phrase in source sentence
|
||||||
int beginSentence = m_srcSntBreaks[sntIndex];
|
int beginSentence = m_srcSntBreaks[sntIndex];
|
||||||
int rightIdx = wordIndex - beginSentence
|
int rightIdx = wordIndex - beginSentence
|
||||||
,leftIdx = rightIdx - sourceSize + 1;
|
,leftIdx = rightIdx - sourceSize + 1;
|
||||||
return curSnt.Extract(m_maxPhraseLength, phrasePairs, leftIdx, rightIdx); // extract all phrase Alignments in sentence
|
return curSnt.Extract(m_maxPhraseLength, phrasePairs, leftIdx, rightIdx); // extract all phrase Alignments in sentence
|
||||||
}
|
}
|
||||||
|
|
||||||
void BilingualDynSuffixArray::CleanUp()
|
void BilingualDynSuffixArray::CleanUp()
|
||||||
{
|
{
|
||||||
//m_wordPairCache.clear();
|
//m_wordPairCache.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
int BilingualDynSuffixArray::LoadCorpus(InputFileStream& corpus, const FactorList& factors,
|
int BilingualDynSuffixArray::LoadCorpus(InputFileStream& corpus, const FactorList& factors,
|
||||||
std::vector<wordID_t>& cArray, std::vector<wordID_t>& sntArray,
|
std::vector<wordID_t>& cArray, std::vector<wordID_t>& sntArray,
|
||||||
Vocab* vocab)
|
Vocab* vocab)
|
||||||
{
|
{
|
||||||
std::string line, word;
|
std::string line, word;
|
||||||
int sntIdx(0);
|
int sntIdx(0);
|
||||||
// corpus.seekg(0); Seems needless -> commented out to allow loading of gzipped corpora (gzfilebuf doesn't support seeking).
|
// corpus.seekg(0); Seems needless -> commented out to allow loading of gzipped corpora (gzfilebuf doesn't support seeking).
|
||||||
const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
|
const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
|
||||||
while(getline(corpus, line)) {
|
while(getline(corpus, line)) {
|
||||||
sntArray.push_back(sntIdx);
|
sntArray.push_back(sntIdx);
|
||||||
Phrase phrase(ARRAY_SIZE_INCR);
|
Phrase phrase(ARRAY_SIZE_INCR);
|
||||||
// parse phrase
|
// parse phrase
|
||||||
phrase.CreateFromString( factors, line, factorDelimiter);
|
phrase.CreateFromString( factors, line, factorDelimiter);
|
||||||
// store words in vocabulary and corpus
|
// store words in vocabulary and corpus
|
||||||
for( size_t i = 0; i < phrase.GetSize(); ++i) {
|
for( size_t i = 0; i < phrase.GetSize(); ++i) {
|
||||||
cArray.push_back( vocab->GetWordID(phrase.GetWord(i)) );
|
cArray.push_back( vocab->GetWordID(phrase.GetWord(i)) );
|
||||||
}
|
}
|
||||||
sntIdx += phrase.GetSize();
|
sntIdx += phrase.GetSize();
|
||||||
}
|
}
|
||||||
//cArray.push_back(vocab->GetkOOVWordID); // signify end of corpus
|
//cArray.push_back(vocab->GetkOOVWordID); // signify end of corpus
|
||||||
vocab->MakeClosed(); // avoid adding words
|
vocab->MakeClosed(); // avoid adding words
|
||||||
return cArray.size();
|
return cArray.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
bool BilingualDynSuffixArray::GetLocalVocabIDs(const Phrase& src, SAPhrase &output) const
|
bool BilingualDynSuffixArray::GetLocalVocabIDs(const Phrase& src, SAPhrase &output) const
|
||||||
{
|
{
|
||||||
// looks up the SA vocab ids for the current src phrase
|
// looks up the SA vocab ids for the current src phrase
|
||||||
size_t phraseSize = src.GetSize();
|
size_t phraseSize = src.GetSize();
|
||||||
for (size_t pos = 0; pos < phraseSize; ++pos) {
|
for (size_t pos = 0; pos < phraseSize; ++pos) {
|
||||||
const Word &word = src.GetWord(pos);
|
const Word &word = src.GetWord(pos);
|
||||||
wordID_t arrayId = m_srcVocab->GetWordID(word);
|
wordID_t arrayId = m_srcVocab->GetWordID(word);
|
||||||
if (arrayId == m_srcVocab->GetkOOVWordID())
|
if (arrayId == m_srcVocab->GetkOOVWordID()) {
|
||||||
{ // oov
|
// oov
|
||||||
return false;
|
return false;
|
||||||
}
|
} else {
|
||||||
else
|
output.SetId(pos, arrayId);
|
||||||
{
|
//cerr << arrayId << " ";
|
||||||
output.SetId(pos, arrayId);
|
}
|
||||||
//cerr << arrayId << " ";
|
}
|
||||||
}
|
return true;
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pair<float, float> BilingualDynSuffixArray::GetLexicalWeight(const PhrasePair& phrasepair) const
|
pair<float, float> BilingualDynSuffixArray::GetLexicalWeight(const PhrasePair& phrasepair) const
|
||||||
{
|
{
|
||||||
//return pair<float, float>(1, 1);
|
//return pair<float, float>(1, 1);
|
||||||
float srcLexWeight(1.0), trgLexWeight(1.0);
|
float srcLexWeight(1.0), trgLexWeight(1.0);
|
||||||
std::map<pair<wordID_t, wordID_t>, float> targetProbs; // collect sum of target probs given source words
|
std::map<pair<wordID_t, wordID_t>, float> targetProbs; // collect sum of target probs given source words
|
||||||
//const SentenceAlignment& alignment = m_alignments[phrasepair.m_sntIndex];
|
//const SentenceAlignment& alignment = m_alignments[phrasepair.m_sntIndex];
|
||||||
const SentenceAlignment& alignment = GetSentenceAlignment(phrasepair.m_sntIndex);
|
const SentenceAlignment& alignment = GetSentenceAlignment(phrasepair.m_sntIndex);
|
||||||
std::map<pair<wordID_t, wordID_t>, pair<float, float> >::const_iterator itrCache;
|
std::map<pair<wordID_t, wordID_t>, pair<float, float> >::const_iterator itrCache;
|
||||||
// for each source word
|
// for each source word
|
||||||
for(int srcIdx = phrasepair.m_startSource; srcIdx <= phrasepair.m_endSource; ++srcIdx) {
|
for(int srcIdx = phrasepair.m_startSource; srcIdx <= phrasepair.m_endSource; ++srcIdx) {
|
||||||
float srcSumPairProbs(0);
|
float srcSumPairProbs(0);
|
||||||
wordID_t srcWord = m_srcCorpus->at(srcIdx + m_srcSntBreaks[phrasepair.m_sntIndex]); // localIDs
|
wordID_t srcWord = m_srcCorpus->at(srcIdx + m_srcSntBreaks[phrasepair.m_sntIndex]); // localIDs
|
||||||
const std::vector<int>& srcWordAlignments = alignment.alignedList.at(srcIdx);
|
const std::vector<int>& srcWordAlignments = alignment.alignedList.at(srcIdx);
|
||||||
// for each target word aligned to this source word in this alignment
|
// for each target word aligned to this source word in this alignment
|
||||||
if(srcWordAlignments.size() == 0) { // get p(NULL|src)
|
if(srcWordAlignments.size() == 0) { // get p(NULL|src)
|
||||||
pair<wordID_t, wordID_t> wordpair = make_pair(srcWord, m_srcVocab->GetkOOVWordID());
|
pair<wordID_t, wordID_t> wordpair = make_pair(srcWord, m_srcVocab->GetkOOVWordID());
|
||||||
itrCache = m_wordPairCache.find(wordpair);
|
itrCache = m_wordPairCache.find(wordpair);
|
||||||
if(itrCache == m_wordPairCache.end()) { // if not in cache
|
if(itrCache == m_wordPairCache.end()) { // if not in cache
|
||||||
CacheWordProbs(srcWord);
|
CacheWordProbs(srcWord);
|
||||||
itrCache = m_wordPairCache.find(wordpair); // search cache again
|
itrCache = m_wordPairCache.find(wordpair); // search cache again
|
||||||
}
|
}
|
||||||
CHECK(itrCache != m_wordPairCache.end());
|
CHECK(itrCache != m_wordPairCache.end());
|
||||||
srcSumPairProbs += itrCache->second.first;
|
srcSumPairProbs += itrCache->second.first;
|
||||||
targetProbs[wordpair] = itrCache->second.second;
|
targetProbs[wordpair] = itrCache->second.second;
|
||||||
}
|
} else { // extract p(trg|src)
|
||||||
else { // extract p(trg|src)
|
for(size_t i = 0; i < srcWordAlignments.size(); ++i) { // for each aligned word
|
||||||
for(size_t i = 0; i < srcWordAlignments.size(); ++i) { // for each aligned word
|
int trgIdx = srcWordAlignments[i];
|
||||||
int trgIdx = srcWordAlignments[i];
|
wordID_t trgWord = m_trgCorpus->at(trgIdx + m_trgSntBreaks[phrasepair.m_sntIndex]);
|
||||||
wordID_t trgWord = m_trgCorpus->at(trgIdx + m_trgSntBreaks[phrasepair.m_sntIndex]);
|
// get probability of this source->target word pair
|
||||||
// get probability of this source->target word pair
|
pair<wordID_t, wordID_t> wordpair = make_pair(srcWord, trgWord);
|
||||||
pair<wordID_t, wordID_t> wordpair = make_pair(srcWord, trgWord);
|
itrCache = m_wordPairCache.find(wordpair);
|
||||||
itrCache = m_wordPairCache.find(wordpair);
|
if(itrCache == m_wordPairCache.end()) { // if not in cache
|
||||||
if(itrCache == m_wordPairCache.end()) { // if not in cache
|
|
||||||
CacheWordProbs(srcWord);
|
CacheWordProbs(srcWord);
|
||||||
itrCache = m_wordPairCache.find(wordpair); // search cache again
|
itrCache = m_wordPairCache.find(wordpair); // search cache again
|
||||||
}
|
|
||||||
CHECK(itrCache != m_wordPairCache.end());
|
|
||||||
srcSumPairProbs += itrCache->second.first;
|
|
||||||
targetProbs[wordpair] = itrCache->second.second;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
float srcNormalizer = srcWordAlignments.size() < 2 ? 1.0 : 1.0 / float(srcWordAlignments.size());
|
|
||||||
srcLexWeight *= (srcNormalizer * srcSumPairProbs);
|
|
||||||
} // end for each source word
|
|
||||||
for(int trgIdx = phrasepair.m_startTarget; trgIdx <= phrasepair.m_endTarget; ++trgIdx) {
|
|
||||||
float trgSumPairProbs(0);
|
|
||||||
wordID_t trgWord = m_trgCorpus->at(trgIdx + m_trgSntBreaks[phrasepair.m_sntIndex]);
|
|
||||||
for (std::map<pair<wordID_t, wordID_t>, float>::const_iterator trgItr
|
|
||||||
= targetProbs.begin(); trgItr != targetProbs.end(); ++trgItr) {
|
|
||||||
if(trgItr->first.second == trgWord)
|
|
||||||
trgSumPairProbs += trgItr->second;
|
|
||||||
}
|
}
|
||||||
if(trgSumPairProbs == 0) continue; // currently don't store target-side SA
|
CHECK(itrCache != m_wordPairCache.end());
|
||||||
int noAligned = alignment.numberAligned.at(trgIdx);
|
srcSumPairProbs += itrCache->second.first;
|
||||||
float trgNormalizer = noAligned < 2 ? 1.0 : 1.0 / float(noAligned);
|
targetProbs[wordpair] = itrCache->second.second;
|
||||||
trgLexWeight *= (trgNormalizer * trgSumPairProbs);
|
}
|
||||||
}
|
}
|
||||||
// TODO::Need to get p(NULL|trg)
|
float srcNormalizer = srcWordAlignments.size() < 2 ? 1.0 : 1.0 / float(srcWordAlignments.size());
|
||||||
return pair<float, float>(srcLexWeight, trgLexWeight);
|
srcLexWeight *= (srcNormalizer * srcSumPairProbs);
|
||||||
|
} // end for each source word
|
||||||
|
for(int trgIdx = phrasepair.m_startTarget; trgIdx <= phrasepair.m_endTarget; ++trgIdx) {
|
||||||
|
float trgSumPairProbs(0);
|
||||||
|
wordID_t trgWord = m_trgCorpus->at(trgIdx + m_trgSntBreaks[phrasepair.m_sntIndex]);
|
||||||
|
for (std::map<pair<wordID_t, wordID_t>, float>::const_iterator trgItr
|
||||||
|
= targetProbs.begin(); trgItr != targetProbs.end(); ++trgItr) {
|
||||||
|
if(trgItr->first.second == trgWord)
|
||||||
|
trgSumPairProbs += trgItr->second;
|
||||||
|
}
|
||||||
|
if(trgSumPairProbs == 0) continue; // currently don't store target-side SA
|
||||||
|
int noAligned = alignment.numberAligned.at(trgIdx);
|
||||||
|
float trgNormalizer = noAligned < 2 ? 1.0 : 1.0 / float(noAligned);
|
||||||
|
trgLexWeight *= (trgNormalizer * trgSumPairProbs);
|
||||||
|
}
|
||||||
|
// TODO::Need to get p(NULL|trg)
|
||||||
|
return pair<float, float>(srcLexWeight, trgLexWeight);
|
||||||
}
|
}
|
||||||
void BilingualDynSuffixArray::CacheFreqWords() const {
|
void BilingualDynSuffixArray::CacheFreqWords() const
|
||||||
|
{
|
||||||
std::multimap<int, wordID_t> wordCnts;
|
std::multimap<int, wordID_t> wordCnts;
|
||||||
// for each source word in vocab
|
// for each source word in vocab
|
||||||
Vocab::Word2Id::const_iterator it;
|
Vocab::Word2Id::const_iterator it;
|
||||||
for(it = m_srcVocab->VocabStart(); it != m_srcVocab->VocabEnd(); ++it) {
|
for(it = m_srcVocab->VocabStart(); it != m_srcVocab->VocabEnd(); ++it) {
|
||||||
// get its frequency
|
// get its frequency
|
||||||
wordID_t srcWord = it->second;
|
wordID_t srcWord = it->second;
|
||||||
std::vector<wordID_t> sword(1, srcWord), wrdIndices;
|
std::vector<wordID_t> sword(1, srcWord), wrdIndices;
|
||||||
m_srcSA->GetCorpusIndex(&sword, &wrdIndices);
|
m_srcSA->GetCorpusIndex(&sword, &wrdIndices);
|
||||||
if(wrdIndices.size() >= 1000) { // min count
|
if(wrdIndices.size() >= 1000) { // min count
|
||||||
wordCnts.insert(make_pair(wrdIndices.size(), srcWord));
|
wordCnts.insert(make_pair(wrdIndices.size(), srcWord));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
int numSoFar(0);
|
int numSoFar(0);
|
||||||
std::multimap<int, wordID_t>::reverse_iterator ritr;
|
std::multimap<int, wordID_t>::reverse_iterator ritr;
|
||||||
for(ritr = wordCnts.rbegin(); ritr != wordCnts.rend(); ++ritr) {
|
for(ritr = wordCnts.rbegin(); ritr != wordCnts.rend(); ++ritr) {
|
||||||
m_freqWordsCached.insert(ritr->second);
|
m_freqWordsCached.insert(ritr->second);
|
||||||
CacheWordProbs(ritr->second);
|
CacheWordProbs(ritr->second);
|
||||||
if(++numSoFar == 50) break; // get top counts
|
if(++numSoFar == 50) break; // get top counts
|
||||||
}
|
}
|
||||||
cerr << "\tCached " << m_freqWordsCached.size() << " source words\n";
|
cerr << "\tCached " << m_freqWordsCached.size() << " source words\n";
|
||||||
}
|
}
|
||||||
void BilingualDynSuffixArray::CacheWordProbs(wordID_t srcWord) const
|
void BilingualDynSuffixArray::CacheWordProbs(wordID_t srcWord) const
|
||||||
{
|
{
|
||||||
std::map<wordID_t, int> counts;
|
std::map<wordID_t, int> counts;
|
||||||
std::vector<wordID_t> sword(1, srcWord), wrdIndices;
|
std::vector<wordID_t> sword(1, srcWord), wrdIndices;
|
||||||
bool ret = m_srcSA->GetCorpusIndex(&sword, &wrdIndices);
|
bool ret = m_srcSA->GetCorpusIndex(&sword, &wrdIndices);
|
||||||
CHECK(ret);
|
CHECK(ret);
|
||||||
std::vector<int> sntIndexes = GetSntIndexes(wrdIndices, 1, m_srcSntBreaks);
|
std::vector<int> sntIndexes = GetSntIndexes(wrdIndices, 1, m_srcSntBreaks);
|
||||||
float denom(0);
|
float denom(0);
|
||||||
// for each occurrence of this word
|
// for each occurrence of this word
|
||||||
for(size_t snt = 0; snt < sntIndexes.size(); ++snt) {
|
for(size_t snt = 0; snt < sntIndexes.size(); ++snt) {
|
||||||
int sntIdx = sntIndexes.at(snt); // get corpus index for sentence
|
int sntIdx = sntIndexes.at(snt); // get corpus index for sentence
|
||||||
CHECK(sntIdx != -1);
|
CHECK(sntIdx != -1);
|
||||||
int srcWrdSntIdx = wrdIndices.at(snt) - m_srcSntBreaks.at(sntIdx); // get word index in sentence
|
int srcWrdSntIdx = wrdIndices.at(snt) - m_srcSntBreaks.at(sntIdx); // get word index in sentence
|
||||||
const std::vector<int> srcAlg = GetSentenceAlignment(sntIdx).alignedList.at(srcWrdSntIdx); // list of target words for this source word
|
const std::vector<int> srcAlg = GetSentenceAlignment(sntIdx).alignedList.at(srcWrdSntIdx); // list of target words for this source word
|
||||||
if(srcAlg.size() == 0) {
|
if(srcAlg.size() == 0) {
|
||||||
++counts[m_srcVocab->GetkOOVWordID()]; // if not alligned then align to NULL word
|
++counts[m_srcVocab->GetkOOVWordID()]; // if not alligned then align to NULL word
|
||||||
++denom;
|
++denom;
|
||||||
}
|
} else { //get target words aligned to srcword in this sentence
|
||||||
else { //get target words aligned to srcword in this sentence
|
for(size_t i=0; i < srcAlg.size(); ++i) {
|
||||||
for(size_t i=0; i < srcAlg.size(); ++i) {
|
wordID_t trgWord = m_trgCorpus->at(srcAlg[i] + m_trgSntBreaks[sntIdx]);
|
||||||
wordID_t trgWord = m_trgCorpus->at(srcAlg[i] + m_trgSntBreaks[sntIdx]);
|
++counts[trgWord];
|
||||||
++counts[trgWord];
|
++denom;
|
||||||
++denom;
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
// now we've gotten counts of all target words aligned to this source word
|
||||||
// now we've gotten counts of all target words aligned to this source word
|
// get probs and cache all pairs
|
||||||
// get probs and cache all pairs
|
for(std::map<wordID_t, int>::const_iterator itrCnt = counts.begin();
|
||||||
for(std::map<wordID_t, int>::const_iterator itrCnt = counts.begin();
|
itrCnt != counts.end(); ++itrCnt) {
|
||||||
itrCnt != counts.end(); ++itrCnt) {
|
pair<wordID_t, wordID_t> wordPair = make_pair(srcWord, itrCnt->first);
|
||||||
pair<wordID_t, wordID_t> wordPair = make_pair(srcWord, itrCnt->first);
|
float srcTrgPrb = float(itrCnt->second) / float(denom); // gives p(src->trg)
|
||||||
float srcTrgPrb = float(itrCnt->second) / float(denom); // gives p(src->trg)
|
float trgSrcPrb = float(itrCnt->second) / float(counts.size()); // gives p(trg->src)
|
||||||
float trgSrcPrb = float(itrCnt->second) / float(counts.size()); // gives p(trg->src)
|
m_wordPairCache[wordPair] = pair<float, float>(srcTrgPrb, trgSrcPrb);
|
||||||
m_wordPairCache[wordPair] = pair<float, float>(srcTrgPrb, trgSrcPrb);
|
}
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
SAPhrase BilingualDynSuffixArray::TrgPhraseFromSntIdx(const PhrasePair& phrasepair) const
|
SAPhrase BilingualDynSuffixArray::TrgPhraseFromSntIdx(const PhrasePair& phrasepair) const
|
||||||
{
|
{
|
||||||
// takes sentence indexes and looks up vocab IDs
|
// takes sentence indexes and looks up vocab IDs
|
||||||
SAPhrase phraseIds(phrasepair.GetTargetSize());
|
SAPhrase phraseIds(phrasepair.GetTargetSize());
|
||||||
int sntIndex = phrasepair.m_sntIndex;
|
int sntIndex = phrasepair.m_sntIndex;
|
||||||
int id(-1), pos(0);
|
int id(-1), pos(0);
|
||||||
for(int i=phrasepair.m_startTarget; i <= phrasepair.m_endTarget; ++i) { // look up trg words
|
for(int i=phrasepair.m_startTarget; i <= phrasepair.m_endTarget; ++i) { // look up trg words
|
||||||
id = m_trgCorpus->at(m_trgSntBreaks[sntIndex] + i);
|
id = m_trgCorpus->at(m_trgSntBreaks[sntIndex] + i);
|
||||||
phraseIds.SetId(pos++, id);
|
phraseIds.SetId(pos++, id);
|
||||||
}
|
}
|
||||||
return phraseIds;
|
return phraseIds;
|
||||||
}
|
|
||||||
|
|
||||||
TargetPhrase* BilingualDynSuffixArray::GetMosesFactorIDs(const SAPhrase& phrase) const
|
|
||||||
{
|
|
||||||
TargetPhrase* targetPhrase = new TargetPhrase(Output);
|
|
||||||
for(size_t i=0; i < phrase.words.size(); ++i) { // look up trg words
|
|
||||||
Word& word = m_trgVocab->GetWord( phrase.words[i]);
|
|
||||||
CHECK(word != m_trgVocab->GetkOOVWord());
|
|
||||||
targetPhrase->AddWord(word);
|
|
||||||
}
|
|
||||||
// scoring
|
|
||||||
return targetPhrase;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void BilingualDynSuffixArray::GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair<Scores, TargetPhrase*> > & target) const
|
TargetPhrase* BilingualDynSuffixArray::GetMosesFactorIDs(const SAPhrase& phrase) const
|
||||||
|
{
|
||||||
|
TargetPhrase* targetPhrase = new TargetPhrase(Output);
|
||||||
|
for(size_t i=0; i < phrase.words.size(); ++i) { // look up trg words
|
||||||
|
Word& word = m_trgVocab->GetWord( phrase.words[i]);
|
||||||
|
CHECK(word != m_trgVocab->GetkOOVWord());
|
||||||
|
targetPhrase->AddWord(word);
|
||||||
|
}
|
||||||
|
// scoring
|
||||||
|
return targetPhrase;
|
||||||
|
}
|
||||||
|
|
||||||
|
void BilingualDynSuffixArray::GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair<Scores, TargetPhrase*> > & target) const
|
||||||
{
|
{
|
||||||
//cerr << "phrase is \"" << src << endl;
|
//cerr << "phrase is \"" << src << endl;
|
||||||
size_t sourceSize = src.GetSize();
|
size_t sourceSize = src.GetSize();
|
||||||
SAPhrase localIDs(sourceSize);
|
SAPhrase localIDs(sourceSize);
|
||||||
if(!GetLocalVocabIDs(src, localIDs)) return;
|
if(!GetLocalVocabIDs(src, localIDs)) return;
|
||||||
float totalTrgPhrases(0);
|
float totalTrgPhrases(0);
|
||||||
std::map<SAPhrase, int> phraseCounts;
|
std::map<SAPhrase, int> phraseCounts;
|
||||||
//std::map<SAPhrase, PhrasePair> phraseColl; // (one of) the word indexes this phrase was taken from
|
//std::map<SAPhrase, PhrasePair> phraseColl; // (one of) the word indexes this phrase was taken from
|
||||||
std::map<SAPhrase, pair<float, float> > lexicalWeights;
|
std::map<SAPhrase, pair<float, float> > lexicalWeights;
|
||||||
std::map<SAPhrase, pair<float, float> >::iterator itrLexW;
|
std::map<SAPhrase, pair<float, float> >::iterator itrLexW;
|
||||||
std::vector<unsigned> wrdIndices;
|
std::vector<unsigned> wrdIndices;
|
||||||
// extract sentence IDs from SA and return rightmost index of phrases
|
// extract sentence IDs from SA and return rightmost index of phrases
|
||||||
if(!m_srcSA->GetCorpusIndex(&(localIDs.words), &wrdIndices)) return;
|
if(!m_srcSA->GetCorpusIndex(&(localIDs.words), &wrdIndices)) return;
|
||||||
SampleSelection(wrdIndices);
|
SampleSelection(wrdIndices);
|
||||||
std::vector<int> sntIndexes = GetSntIndexes(wrdIndices, sourceSize, m_srcSntBreaks);
|
std::vector<int> sntIndexes = GetSntIndexes(wrdIndices, sourceSize, m_srcSntBreaks);
|
||||||
// for each sentence with this phrase
|
// for each sentence with this phrase
|
||||||
for(size_t snt = 0; snt < sntIndexes.size(); ++snt) {
|
for(size_t snt = 0; snt < sntIndexes.size(); ++snt) {
|
||||||
std::vector<PhrasePair*> phrasePairs; // to store all phrases possible from current sentence
|
std::vector<PhrasePair*> phrasePairs; // to store all phrases possible from current sentence
|
||||||
int sntIndex = sntIndexes.at(snt); // get corpus index for sentence
|
int sntIndex = sntIndexes.at(snt); // get corpus index for sentence
|
||||||
if(sntIndex == -1) continue; // bad flag set by GetSntIndexes()
|
if(sntIndex == -1) continue; // bad flag set by GetSntIndexes()
|
||||||
ExtractPhrases(sntIndex, wrdIndices[snt], sourceSize, phrasePairs);
|
ExtractPhrases(sntIndex, wrdIndices[snt], sourceSize, phrasePairs);
|
||||||
//cerr << "extracted " << phrasePairs.size() << endl;
|
//cerr << "extracted " << phrasePairs.size() << endl;
|
||||||
totalTrgPhrases += phrasePairs.size(); // keep track of count of each extracted phrase pair
|
totalTrgPhrases += phrasePairs.size(); // keep track of count of each extracted phrase pair
|
||||||
std::vector<PhrasePair*>::iterator iterPhrasePair;
|
std::vector<PhrasePair*>::iterator iterPhrasePair;
|
||||||
for (iterPhrasePair = phrasePairs.begin(); iterPhrasePair != phrasePairs.end(); ++iterPhrasePair) {
|
for (iterPhrasePair = phrasePairs.begin(); iterPhrasePair != phrasePairs.end(); ++iterPhrasePair) {
|
||||||
SAPhrase phrase = TrgPhraseFromSntIdx(**iterPhrasePair);
|
SAPhrase phrase = TrgPhraseFromSntIdx(**iterPhrasePair);
|
||||||
phraseCounts[phrase]++; // count each unique phrase
|
phraseCounts[phrase]++; // count each unique phrase
|
||||||
// NOTE::Correct but slow to extract lexical weight here. could do
|
// NOTE::Correct but slow to extract lexical weight here. could do
|
||||||
// it later for only the top phrases chosen by phrase prob p(e|f)
|
// it later for only the top phrases chosen by phrase prob p(e|f)
|
||||||
pair<float, float> lexWeight = GetLexicalWeight(**iterPhrasePair); // get lexical weighting for this phrase pair
|
pair<float, float> lexWeight = GetLexicalWeight(**iterPhrasePair); // get lexical weighting for this phrase pair
|
||||||
itrLexW = lexicalWeights.find(phrase); // check if phrase already has lexical weight attached
|
itrLexW = lexicalWeights.find(phrase); // check if phrase already has lexical weight attached
|
||||||
if((itrLexW != lexicalWeights.end()) && (itrLexW->second.first < lexWeight.first))
|
if((itrLexW != lexicalWeights.end()) && (itrLexW->second.first < lexWeight.first))
|
||||||
itrLexW->second = lexWeight; // if this lex weight is greater save it
|
itrLexW->second = lexWeight; // if this lex weight is greater save it
|
||||||
else lexicalWeights[phrase] = lexWeight; // else save
|
else lexicalWeights[phrase] = lexWeight; // else save
|
||||||
}
|
}
|
||||||
// done with sentence. delete SA phrase pairs
|
// done with sentence. delete SA phrase pairs
|
||||||
RemoveAllInColl(phrasePairs);
|
RemoveAllInColl(phrasePairs);
|
||||||
} // done with all sentences
|
} // done with all sentences
|
||||||
// convert to moses phrase pairs
|
// convert to moses phrase pairs
|
||||||
std::map<SAPhrase, int>::const_iterator iterPhrases;
|
std::map<SAPhrase, int>::const_iterator iterPhrases;
|
||||||
std::multimap<Scores, const SAPhrase*, ScoresComp> phraseScores (*m_scoreCmp);
|
std::multimap<Scores, const SAPhrase*, ScoresComp> phraseScores (*m_scoreCmp);
|
||||||
// get scores of all phrases
|
// get scores of all phrases
|
||||||
for(iterPhrases = phraseCounts.begin(); iterPhrases != phraseCounts.end(); ++iterPhrases) {
|
for(iterPhrases = phraseCounts.begin(); iterPhrases != phraseCounts.end(); ++iterPhrases) {
|
||||||
float trg2SrcMLE = float(iterPhrases->second) / totalTrgPhrases;
|
float trg2SrcMLE = float(iterPhrases->second) / totalTrgPhrases;
|
||||||
itrLexW = lexicalWeights.find(iterPhrases->first);
|
itrLexW = lexicalWeights.find(iterPhrases->first);
|
||||||
CHECK(itrLexW != lexicalWeights.end());
|
CHECK(itrLexW != lexicalWeights.end());
|
||||||
Scores scoreVector(3);
|
Scores scoreVector(3);
|
||||||
scoreVector[0] = trg2SrcMLE;
|
scoreVector[0] = trg2SrcMLE;
|
||||||
scoreVector[1] = itrLexW->second.first;
|
scoreVector[1] = itrLexW->second.first;
|
||||||
scoreVector[2] = 2.718; // exp(1);
|
scoreVector[2] = 2.718; // exp(1);
|
||||||
phraseScores.insert(make_pair(scoreVector, &iterPhrases->first));
|
phraseScores.insert(make_pair(scoreVector, &iterPhrases->first));
|
||||||
}
|
}
|
||||||
// return top scoring phrases
|
// return top scoring phrases
|
||||||
std::multimap<Scores, const SAPhrase*, ScoresComp>::reverse_iterator ritr;
|
std::multimap<Scores, const SAPhrase*, ScoresComp>::reverse_iterator ritr;
|
||||||
for(ritr = phraseScores.rbegin(); ritr != phraseScores.rend(); ++ritr) {
|
for(ritr = phraseScores.rbegin(); ritr != phraseScores.rend(); ++ritr) {
|
||||||
Scores scoreVector = ritr->first;
|
Scores scoreVector = ritr->first;
|
||||||
TargetPhrase *targetPhrase = GetMosesFactorIDs(*ritr->second);
|
TargetPhrase *targetPhrase = GetMosesFactorIDs(*ritr->second);
|
||||||
target.push_back(make_pair( scoreVector, targetPhrase));
|
target.push_back(make_pair( scoreVector, targetPhrase));
|
||||||
if(target.size() == m_maxSampleSize) break;
|
if(target.size() == m_maxSampleSize) break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<int> BilingualDynSuffixArray::GetSntIndexes(std::vector<unsigned>& wrdIndices,
|
std::vector<int> BilingualDynSuffixArray::GetSntIndexes(std::vector<unsigned>& wrdIndices,
|
||||||
const int sourceSize, const std::vector<unsigned>& sntBreaks) const
|
const int sourceSize, const std::vector<unsigned>& sntBreaks) const
|
||||||
{
|
{
|
||||||
std::vector<unsigned>::const_iterator vit;
|
std::vector<unsigned>::const_iterator vit;
|
||||||
std::vector<int> sntIndexes;
|
std::vector<int> sntIndexes;
|
||||||
for(size_t i=0; i < wrdIndices.size(); ++i) {
|
for(size_t i=0; i < wrdIndices.size(); ++i) {
|
||||||
vit = std::upper_bound(sntBreaks.begin(), sntBreaks.end(), wrdIndices[i]);
|
vit = std::upper_bound(sntBreaks.begin(), sntBreaks.end(), wrdIndices[i]);
|
||||||
int index = int(vit - sntBreaks.begin()) - 1;
|
int index = int(vit - sntBreaks.begin()) - 1;
|
||||||
// check for phrases that cross sentence boundaries
|
// check for phrases that cross sentence boundaries
|
||||||
if(wrdIndices[i] - sourceSize + 1 < sntBreaks.at(index))
|
if(wrdIndices[i] - sourceSize + 1 < sntBreaks.at(index))
|
||||||
sntIndexes.push_back(-1); // set bad flag
|
sntIndexes.push_back(-1); // set bad flag
|
||||||
else
|
else
|
||||||
sntIndexes.push_back(index); // store the index of the sentence in the corpus
|
sntIndexes.push_back(index); // store the index of the sentence in the corpus
|
||||||
}
|
}
|
||||||
return sntIndexes;
|
return sntIndexes;
|
||||||
}
|
}
|
||||||
|
|
||||||
int BilingualDynSuffixArray::SampleSelection(std::vector<unsigned>& sample,
|
int BilingualDynSuffixArray::SampleSelection(std::vector<unsigned>& sample,
|
||||||
int sampleSize) const
|
int sampleSize) const
|
||||||
{
|
{
|
||||||
// only use top 'sampleSize' number of samples
|
// only use top 'sampleSize' number of samples
|
||||||
if(sample.size() > sampleSize)
|
if(sample.size() > sampleSize)
|
||||||
sample.erase(sample.begin()+sampleSize, sample.end());
|
sample.erase(sample.begin()+sampleSize, sample.end());
|
||||||
return sample.size();
|
return sample.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
void BilingualDynSuffixArray::addSntPair(string& source, string& target, string& alignment) {
|
void BilingualDynSuffixArray::addSntPair(string& source, string& target, string& alignment)
|
||||||
|
{
|
||||||
vuint_t srcFactor, trgFactor;
|
vuint_t srcFactor, trgFactor;
|
||||||
cerr << "source, target, alignment = " << source << ", " << target << ", " << alignment << endl;
|
cerr << "source, target, alignment = " << source << ", " << target << ", " << alignment << endl;
|
||||||
const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
|
const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
|
||||||
const unsigned oldSrcCrpSize = m_srcCorpus->size(), oldTrgCrpSize = m_trgCorpus->size();
|
const unsigned oldSrcCrpSize = m_srcCorpus->size(), oldTrgCrpSize = m_trgCorpus->size();
|
||||||
cerr << "old source corpus size = " << oldSrcCrpSize << "\told target size = " << oldTrgCrpSize << endl;
|
cerr << "old source corpus size = " << oldSrcCrpSize << "\told target size = " << oldTrgCrpSize << endl;
|
||||||
Phrase sphrase(ARRAY_SIZE_INCR);
|
Phrase sphrase(ARRAY_SIZE_INCR);
|
||||||
@ -471,7 +470,7 @@ void BilingualDynSuffixArray::addSntPair(string& source, string& target, string&
|
|||||||
cerr << "srcFactor[" << (srcFactor.size() - 1) << "] = " << srcFactor.back() << endl;
|
cerr << "srcFactor[" << (srcFactor.size() - 1) << "] = " << srcFactor.back() << endl;
|
||||||
m_srcCorpus->push_back(srcFactor.back()); // add word to corpus
|
m_srcCorpus->push_back(srcFactor.back()); // add word to corpus
|
||||||
}
|
}
|
||||||
m_srcSntBreaks.push_back(oldSrcCrpSize); // former end of corpus is index of new sentence
|
m_srcSntBreaks.push_back(oldSrcCrpSize); // former end of corpus is index of new sentence
|
||||||
m_srcVocab->MakeClosed();
|
m_srcVocab->MakeClosed();
|
||||||
Phrase tphrase(ARRAY_SIZE_INCR);
|
Phrase tphrase(ARRAY_SIZE_INCR);
|
||||||
tphrase.CreateFromString(m_outputFactors, target, factorDelimiter);
|
tphrase.CreateFromString(m_outputFactors, target, factorDelimiter);
|
||||||
@ -494,16 +493,17 @@ void BilingualDynSuffixArray::addSntPair(string& source, string& target, string&
|
|||||||
LoadRawAlignments(alignment);
|
LoadRawAlignments(alignment);
|
||||||
m_trgVocab->MakeClosed();
|
m_trgVocab->MakeClosed();
|
||||||
//for(size_t i=0; i < sphrase.GetSize(); ++i)
|
//for(size_t i=0; i < sphrase.GetSize(); ++i)
|
||||||
//ClearWordInCache(sIDs[i]);
|
//ClearWordInCache(sIDs[i]);
|
||||||
|
|
||||||
}
|
}
|
||||||
void BilingualDynSuffixArray::ClearWordInCache(wordID_t srcWord) {
|
void BilingualDynSuffixArray::ClearWordInCache(wordID_t srcWord)
|
||||||
|
{
|
||||||
if(m_freqWordsCached.find(srcWord) != m_freqWordsCached.end())
|
if(m_freqWordsCached.find(srcWord) != m_freqWordsCached.end())
|
||||||
return;
|
return;
|
||||||
std::map<std::pair<wordID_t, wordID_t>, std::pair<float, float> >::iterator it,
|
std::map<std::pair<wordID_t, wordID_t>, std::pair<float, float> >::iterator it,
|
||||||
first, last;
|
first, last;
|
||||||
for(it = m_wordPairCache.begin(); it != m_wordPairCache.end(); ++it) {
|
for(it = m_wordPairCache.begin(); it != m_wordPairCache.end(); ++it) {
|
||||||
if(it->first.first == srcWord) { // all source words grouped
|
if(it->first.first == srcWord) { // all source words grouped
|
||||||
first = it; // copy first entry of srcWord
|
first = it; // copy first entry of srcWord
|
||||||
last = it++;
|
last = it++;
|
||||||
while(it != m_wordPairCache.end() && (it->first.first == srcWord)) {
|
while(it != m_wordPairCache.end() && (it->first.first == srcWord)) {
|
||||||
@ -513,80 +513,77 @@ void BilingualDynSuffixArray::ClearWordInCache(wordID_t srcWord) {
|
|||||||
m_wordPairCache.erase(first, last);
|
m_wordPairCache.erase(first, last);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/**
 * Build an empty alignment record for one sentence pair.
 *
 * @param sntIndex    index of the sentence pair this record belongs to
 * @param sourceSize  number of source positions; alignedList receives one
 *                    empty list of target positions per source position
 * @param targetSize  number of target positions; numberAligned receives one
 *                    zeroed counter per target position
 */
SentenceAlignment::SentenceAlignment(int sntIndex, int sourceSize, int targetSize)
  :m_sntIndex(sntIndex)
  ,trgSnt(0)   // never assigned by this constructor: start as null, not indeterminate
  ,srcSnt(0)
  ,numberAligned(targetSize, 0)
  ,alignedList(sourceSize)   // sized construction already yields empty inner vectors
{
}
|
||||||
|
|
||||||
/**
 * Collect the target spans that are consistent with the word alignment for
 * the source span [startSource, endSource] and append them to `ret`.
 *
 * The target span is first projected from the alignment points of the source
 * span.  It is rejected if no point exists, if it is wider than
 * maxPhraseLength, or if one of its target words is also aligned to a source
 * word outside the span.  Otherwise the span is emitted together with every
 * extension over neighbouring unaligned target words that still fits within
 * maxPhraseLength.
 *
 * @param maxPhraseLength  maximum number of target words in an extracted span
 * @param ret              output; one heap-allocated PhrasePair is appended
 *                         per accepted span (the caller receives the pointers)
 * @param startSource      first source position (inclusive); used unchecked
 *                         as an index into alignedList
 * @param endSource        last source position (inclusive)
 * @return true when `ret` is non-empty on exit, which includes entries that
 *         were already in `ret` on entry
 */
bool SentenceAlignment::Extract(int maxPhraseLength, std::vector<PhrasePair*> &ret, int startSource, int endSource) const
{
  // foreign = target, F=T
  // english = source, E=S
  const int countTarget = static_cast<int>(numberAligned.size());

  // Every aligned target position indexes usedTarget and is therefore
  // < countTarget, so countTarget is a safe "nothing seen yet" value for the
  // minimum (a fixed constant would break on very long sentences).
  int minTarget = countTarget;
  int maxTarget = -1;

  // Per target word: alignment links that point outside the source span.
  std::vector<int> usedTarget = numberAligned;
  for (int sourcePos = startSource; sourcePos <= endSource; ++sourcePos) {
    const std::vector<int> &links = alignedList[sourcePos];
    for (size_t ind = 0; ind < links.size(); ++ind) {
      const int targetPos = links[ind];
      if (targetPos < minTarget) {
        minTarget = targetPos;
      }
      if (targetPos > maxTarget) {
        maxTarget = targetPos;
      }
      --usedTarget[targetPos];
    }
  }

  // Not aligned to any foreign word at all, or foreign phrase too long.
  if (maxTarget < 0 || maxTarget - minTarget >= maxPhraseLength) {
    return !ret.empty();
  }

  // Reject if a foreign word is also aligned to an out-of-span english word.
  for (int targetPos = minTarget; targetPos <= maxTarget; ++targetPos) {
    if (usedTarget[targetPos] > 0) {
      return !ret.empty();
    }
  }

  // start point of foreign phrase may retreat over unaligned
  for (int startTarget = minTarget;
       startTarget >= 0 &&
       startTarget > maxTarget - maxPhraseLength && // within length limit
       (startTarget == minTarget || numberAligned[startTarget] == 0); // unaligned
       --startTarget) {
    // end point of foreign phrase may advance over unaligned
    for (int endTarget = maxTarget;
         endTarget < countTarget &&
         endTarget < startTarget + maxPhraseLength && // within length limit
         (endTarget == maxTarget || numberAligned[endTarget] == 0); // unaligned
         ++endTarget) {
      ret.push_back(new PhrasePair(startTarget, endTarget, startSource, endSource, m_sntIndex));
    }
  }

  return !ret.empty();
}
|
||||||
|
|
||||||
}// end namepsace
|
}// end namepsace
|
||||||
|
@ -2,70 +2,73 @@
|
|||||||
#define moses_BilingualDynSuffixArray_h
|
#define moses_BilingualDynSuffixArray_h
|
||||||
|
|
||||||
#include "TargetPhrase.h"
|
#include "TargetPhrase.h"
|
||||||
#include "DynSuffixArray.h"
|
#include "DynSuffixArray.h"
|
||||||
#include "DynSAInclude/vocab.h"
|
#include "DynSAInclude/vocab.h"
|
||||||
#include "DynSAInclude/types.h"
|
#include "DynSAInclude/types.h"
|
||||||
#include "DynSAInclude/utils.h"
|
#include "DynSAInclude/utils.h"
|
||||||
#include "InputFileStream.h"
|
#include "InputFileStream.h"
|
||||||
#include "FactorTypeSet.h"
|
#include "FactorTypeSet.h"
|
||||||
|
|
||||||
namespace Moses {
|
namespace Moses
|
||||||
|
{
|
||||||
|
|
||||||
class SAPhrase
|
class SAPhrase
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
std::vector<wordID_t> words;
|
std::vector<wordID_t> words;
|
||||||
|
|
||||||
SAPhrase(size_t phraseSize)
|
SAPhrase(size_t phraseSize)
|
||||||
:words(phraseSize)
|
:words(phraseSize)
|
||||||
{}
|
{}
|
||||||
|
|
||||||
void SetId(size_t pos, wordID_t id)
|
void SetId(size_t pos, wordID_t id) {
|
||||||
{
|
|
||||||
CHECK(pos < words.size());
|
CHECK(pos < words.size());
|
||||||
words[pos] = id;
|
words[pos] = id;
|
||||||
}
|
}
|
||||||
bool operator<(const SAPhrase& phr2) const
|
bool operator<(const SAPhrase& phr2) const {
|
||||||
{ return words < phr2.words; }
|
return words < phr2.words;
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
 * Inclusive target span and source span of one extracted phrase pair,
 * together with the index of the sentence it was extracted from.
 */
class PhrasePair
{
public:
  int m_startTarget, m_endTarget, m_startSource, m_endSource, m_sntIndex;

  PhrasePair(int startTarget, int endTarget, int startSource, int endSource, int sntIndex)
    : m_startTarget(startTarget), m_endTarget(endTarget),
      m_startSource(startSource), m_endSource(endSource),
      m_sntIndex(sntIndex)
  {
  }

  // Number of target words covered; both end points are inclusive.
  size_t GetTargetSize() const {
    const int span = m_endTarget - m_startTarget + 1;
    return static_cast<size_t>(span);
  }
};
|
||||||
|
|
||||||
// Word-alignment record for a single sentence pair, plus the phrase
// extraction routine that works on it.
class SentenceAlignment
{
public:
  // Sizes numberAligned to targetSize (all zero) and alignedList to sourceSize.
  SentenceAlignment(int sntIndex, int sourceSize, int targetSize);
  int m_sntIndex;                   // index of the sentence pair, copied into extracted PhrasePairs
  std::vector<wordID_t>* trgSnt;    // NOTE(review): presumably the target-side word ids; ownership is not visible here - confirm
  std::vector<wordID_t>* srcSnt;    // NOTE(review): presumably the source-side word ids; ownership is not visible here - confirm
  std::vector<int> numberAligned;   // one counter per target position; 0 is treated as "unaligned" by Extract
  std::vector< std::vector<int> > alignedList;  // per source position: the target positions it is aligned to
  // Appends the target spans consistent with source span [startSource, endSource]
  // to ret; returns whether ret is non-empty afterwards.
  bool Extract(int maxPhraseLength, std::vector<PhrasePair*> &ret, int startSource, int endSource) const;
};
|
||||||
class ScoresComp {
|
class ScoresComp
|
||||||
public:
|
{
|
||||||
|
public:
|
||||||
ScoresComp(const std::vector<float>& weights): m_weights(weights) {}
|
ScoresComp(const std::vector<float>& weights): m_weights(weights) {}
|
||||||
bool operator()(const Scores& s1, const Scores& s2) const {
|
bool operator()(const Scores& s1, const Scores& s2) const {
|
||||||
return s1[0] < s2[0]; // just p(e|f) as approximation
|
return s1[0] < s2[0]; // just p(e|f) as approximation
|
||||||
/*float score1(0), score2(0);
|
/*float score1(0), score2(0);
|
||||||
int idx1(0), idx2(0);
|
int idx1(0), idx2(0);
|
||||||
for (Scores::const_iterator itr = s1.begin();
|
for (Scores::const_iterator itr = s1.begin();
|
||||||
itr != s1.end(); ++itr) {
|
itr != s1.end(); ++itr) {
|
||||||
score1 += log(*itr * m_weights.at(idx1++));
|
score1 += log(*itr * m_weights.at(idx1++));
|
||||||
}
|
}
|
||||||
for (Scores::const_iterator itr = s2.begin();
|
for (Scores::const_iterator itr = s2.begin();
|
||||||
itr != s2.end(); ++itr) {
|
itr != s2.end(); ++itr) {
|
||||||
@ -73,73 +76,72 @@ public:
|
|||||||
}
|
}
|
||||||
return score1 < score2;*/
|
return score1 < score2;*/
|
||||||
}
|
}
|
||||||
private:
|
private:
|
||||||
const std::vector<float>& m_weights;
|
const std::vector<float>& m_weights;
|
||||||
};
|
};
|
||||||
|
|
||||||
class BilingualDynSuffixArray {
|
class BilingualDynSuffixArray
|
||||||
public:
|
{
|
||||||
BilingualDynSuffixArray();
|
public:
|
||||||
~BilingualDynSuffixArray();
|
BilingualDynSuffixArray();
|
||||||
bool Load( const std::vector<FactorType>& inputFactors,
|
~BilingualDynSuffixArray();
|
||||||
const std::vector<FactorType>& outputTactors,
|
bool Load( const std::vector<FactorType>& inputFactors,
|
||||||
std::string source, std::string target, std::string alignments,
|
const std::vector<FactorType>& outputTactors,
|
||||||
const std::vector<float> &weight);
|
std::string source, std::string target, std::string alignments,
|
||||||
void GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair<Scores, TargetPhrase*> >& target) const;
|
const std::vector<float> &weight);
|
||||||
void CleanUp();
|
void GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair<Scores, TargetPhrase*> >& target) const;
|
||||||
|
void CleanUp();
|
||||||
void addSntPair(string& source, string& target, string& alignment);
|
void addSntPair(string& source, string& target, string& alignment);
|
||||||
private:
|
private:
|
||||||
DynSuffixArray* m_srcSA;
|
DynSuffixArray* m_srcSA;
|
||||||
DynSuffixArray* m_trgSA;
|
DynSuffixArray* m_trgSA;
|
||||||
std::vector<wordID_t>* m_srcCorpus;
|
std::vector<wordID_t>* m_srcCorpus;
|
||||||
std::vector<wordID_t>* m_trgCorpus;
|
std::vector<wordID_t>* m_trgCorpus;
|
||||||
std::vector<FactorType> m_inputFactors;
|
std::vector<FactorType> m_inputFactors;
|
||||||
std::vector<FactorType> m_outputFactors;
|
std::vector<FactorType> m_outputFactors;
|
||||||
|
|
||||||
std::vector<unsigned> m_srcSntBreaks, m_trgSntBreaks;
|
std::vector<unsigned> m_srcSntBreaks, m_trgSntBreaks;
|
||||||
|
|
||||||
Vocab* m_srcVocab, *m_trgVocab;
|
Vocab* m_srcVocab, *m_trgVocab;
|
||||||
ScoresComp* m_scoreCmp;
|
ScoresComp* m_scoreCmp;
|
||||||
|
|
||||||
std::vector<SentenceAlignment> m_alignments;
|
std::vector<SentenceAlignment> m_alignments;
|
||||||
std::vector<std::vector<short> > m_rawAlignments;
|
std::vector<std::vector<short> > m_rawAlignments;
|
||||||
|
|
||||||
mutable std::map<std::pair<wordID_t, wordID_t>, std::pair<float, float> > m_wordPairCache;
|
mutable std::map<std::pair<wordID_t, wordID_t>, std::pair<float, float> > m_wordPairCache;
|
||||||
mutable std::set<wordID_t> m_freqWordsCached;
|
mutable std::set<wordID_t> m_freqWordsCached;
|
||||||
const size_t m_maxPhraseLength, m_maxSampleSize;
|
const size_t m_maxPhraseLength, m_maxSampleSize;
|
||||||
|
|
||||||
int LoadCorpus(InputFileStream&, const std::vector<FactorType>& factors,
|
int LoadCorpus(InputFileStream&, const std::vector<FactorType>& factors,
|
||||||
std::vector<wordID_t>&, std::vector<wordID_t>&,
|
std::vector<wordID_t>&, std::vector<wordID_t>&,
|
||||||
Vocab*);
|
Vocab*);
|
||||||
int LoadAlignments(InputFileStream& aligs);
|
int LoadAlignments(InputFileStream& aligs);
|
||||||
int LoadRawAlignments(InputFileStream& aligs);
|
int LoadRawAlignments(InputFileStream& aligs);
|
||||||
int LoadRawAlignments(string& aligs);
|
int LoadRawAlignments(string& aligs);
|
||||||
|
|
||||||
bool ExtractPhrases(const int&, const int&, const int&, std::vector<PhrasePair*>&, bool=false) const;
|
bool ExtractPhrases(const int&, const int&, const int&, std::vector<PhrasePair*>&, bool=false) const;
|
||||||
SentenceAlignment GetSentenceAlignment(const int, bool=false) const;
|
SentenceAlignment GetSentenceAlignment(const int, bool=false) const;
|
||||||
int SampleSelection(std::vector<unsigned>&, int = 300) const;
|
int SampleSelection(std::vector<unsigned>&, int = 300) const;
|
||||||
|
|
||||||
std::vector<int> GetSntIndexes(std::vector<unsigned>&, int, const std::vector<unsigned>&) const;
|
std::vector<int> GetSntIndexes(std::vector<unsigned>&, int, const std::vector<unsigned>&) const;
|
||||||
TargetPhrase* GetMosesFactorIDs(const SAPhrase&) const;
|
TargetPhrase* GetMosesFactorIDs(const SAPhrase&) const;
|
||||||
SAPhrase TrgPhraseFromSntIdx(const PhrasePair&) const;
|
SAPhrase TrgPhraseFromSntIdx(const PhrasePair&) const;
|
||||||
bool GetLocalVocabIDs(const Phrase&, SAPhrase &) const;
|
bool GetLocalVocabIDs(const Phrase&, SAPhrase &) const;
|
||||||
void CacheWordProbs(wordID_t) const;
|
void CacheWordProbs(wordID_t) const;
|
||||||
void CacheFreqWords() const;
|
void CacheFreqWords() const;
|
||||||
void ClearWordInCache(wordID_t);
|
void ClearWordInCache(wordID_t);
|
||||||
std::pair<float, float> GetLexicalWeight(const PhrasePair&) const;
|
std::pair<float, float> GetLexicalWeight(const PhrasePair&) const;
|
||||||
|
|
||||||
int GetSourceSentenceSize(size_t sentenceId) const
|
int GetSourceSentenceSize(size_t sentenceId) const {
|
||||||
{
|
return (sentenceId==m_srcSntBreaks.size()-1) ?
|
||||||
return (sentenceId==m_srcSntBreaks.size()-1) ?
|
m_srcCorpus->size() - m_srcSntBreaks.at(sentenceId) :
|
||||||
m_srcCorpus->size() - m_srcSntBreaks.at(sentenceId) :
|
m_srcSntBreaks.at(sentenceId+1) - m_srcSntBreaks.at(sentenceId);
|
||||||
m_srcSntBreaks.at(sentenceId+1) - m_srcSntBreaks.at(sentenceId);
|
}
|
||||||
}
|
int GetTargetSentenceSize(size_t sentenceId) const {
|
||||||
int GetTargetSentenceSize(size_t sentenceId) const
|
return (sentenceId==m_trgSntBreaks.size()-1) ?
|
||||||
{
|
m_trgCorpus->size() - m_trgSntBreaks.at(sentenceId) :
|
||||||
return (sentenceId==m_trgSntBreaks.size()-1) ?
|
m_trgSntBreaks.at(sentenceId+1) - m_trgSntBreaks.at(sentenceId);
|
||||||
m_trgCorpus->size() - m_trgSntBreaks.at(sentenceId) :
|
}
|
||||||
m_trgSntBreaks.at(sentenceId+1) - m_trgSntBreaks.at(sentenceId);
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
} // end namespace
|
} // end namespace
|
||||||
#endif
|
#endif
|
||||||
|
@ -98,8 +98,7 @@ void ChartCell::ProcessSentence(const ChartTranslationOptionList &transOptList
|
|||||||
|
|
||||||
// add all trans opt into queue. using only 1st child node.
|
// add all trans opt into queue. using only 1st child node.
|
||||||
ChartTranslationOptionList::const_iterator iterList;
|
ChartTranslationOptionList::const_iterator iterList;
|
||||||
for (iterList = transOptList.begin(); iterList != transOptList.end(); ++iterList)
|
for (iterList = transOptList.begin(); iterList != transOptList.end(); ++iterList) {
|
||||||
{
|
|
||||||
const ChartTranslationOption &transOpt = **iterList;
|
const ChartTranslationOption &transOpt = **iterList;
|
||||||
RuleCube *ruleCube = new RuleCube(transOpt, allChartCells, m_manager);
|
RuleCube *ruleCube = new RuleCube(transOpt, allChartCells, m_manager);
|
||||||
queue.Add(ruleCube);
|
queue.Add(ruleCube);
|
||||||
@ -107,8 +106,7 @@ void ChartCell::ProcessSentence(const ChartTranslationOptionList &transOptList
|
|||||||
|
|
||||||
// pluck things out of queue and add to hypo collection
|
// pluck things out of queue and add to hypo collection
|
||||||
const size_t popLimit = staticData.GetCubePruningPopLimit();
|
const size_t popLimit = staticData.GetCubePruningPopLimit();
|
||||||
for (size_t numPops = 0; numPops < popLimit && !queue.IsEmpty(); ++numPops)
|
for (size_t numPops = 0; numPops < popLimit && !queue.IsEmpty(); ++numPops) {
|
||||||
{
|
|
||||||
ChartHypothesis *hypo = queue.Pop();
|
ChartHypothesis *hypo = queue.Pop();
|
||||||
AddHypothesis(hypo);
|
AddHypothesis(hypo);
|
||||||
}
|
}
|
||||||
|
@ -34,7 +34,7 @@ class Word;
|
|||||||
|
|
||||||
class ChartCellLabel
|
class ChartCellLabel
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
ChartCellLabel(const WordsRange &coverage, const Word &label,
|
ChartCellLabel(const WordsRange &coverage, const Word &label,
|
||||||
const ChartHypothesisCollection *stack=NULL)
|
const ChartHypothesisCollection *stack=NULL)
|
||||||
: m_coverage(coverage)
|
: m_coverage(coverage)
|
||||||
@ -42,12 +42,17 @@ class ChartCellLabel
|
|||||||
, m_stack(stack)
|
, m_stack(stack)
|
||||||
{}
|
{}
|
||||||
|
|
||||||
const WordsRange &GetCoverage() const { return m_coverage; }
|
const WordsRange &GetCoverage() const {
|
||||||
const Word &GetLabel() const { return m_label; }
|
return m_coverage;
|
||||||
const ChartHypothesisCollection *GetStack() const { return m_stack; }
|
}
|
||||||
|
const Word &GetLabel() const {
|
||||||
|
return m_label;
|
||||||
|
}
|
||||||
|
const ChartHypothesisCollection *GetStack() const {
|
||||||
|
return m_stack;
|
||||||
|
}
|
||||||
|
|
||||||
bool operator<(const ChartCellLabel &other) const
|
bool operator<(const ChartCellLabel &other) const {
|
||||||
{
|
|
||||||
// m_coverage and m_label uniquely identify a ChartCellLabel, so don't
|
// m_coverage and m_label uniquely identify a ChartCellLabel, so don't
|
||||||
// need to compare m_stack.
|
// need to compare m_stack.
|
||||||
if (m_coverage == other.m_coverage) {
|
if (m_coverage == other.m_coverage) {
|
||||||
@ -56,7 +61,7 @@ class ChartCellLabel
|
|||||||
return m_coverage < other.m_coverage;
|
return m_coverage < other.m_coverage;
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
const WordsRange &m_coverage;
|
const WordsRange &m_coverage;
|
||||||
const Word &m_label;
|
const Word &m_label;
|
||||||
const ChartHypothesisCollection *m_stack;
|
const ChartHypothesisCollection *m_stack;
|
||||||
|
@ -34,40 +34,45 @@ class ChartHypothesisCollection;
|
|||||||
|
|
||||||
class ChartCellLabelSet
|
class ChartCellLabelSet
|
||||||
{
|
{
|
||||||
private:
|
private:
|
||||||
typedef std::set<ChartCellLabel> SetType;
|
typedef std::set<ChartCellLabel> SetType;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
typedef SetType::const_iterator const_iterator;
|
typedef SetType::const_iterator const_iterator;
|
||||||
|
|
||||||
ChartCellLabelSet(const WordsRange &coverage) : m_coverage(coverage) {}
|
ChartCellLabelSet(const WordsRange &coverage) : m_coverage(coverage) {}
|
||||||
|
|
||||||
const_iterator begin() const { return m_set.begin(); }
|
const_iterator begin() const {
|
||||||
const_iterator end() const { return m_set.end(); }
|
return m_set.begin();
|
||||||
|
}
|
||||||
|
const_iterator end() const {
|
||||||
|
return m_set.end();
|
||||||
|
}
|
||||||
|
|
||||||
void AddWord(const Word &w)
|
void AddWord(const Word &w) {
|
||||||
{
|
|
||||||
ChartCellLabel cellLabel(m_coverage, w);
|
ChartCellLabel cellLabel(m_coverage, w);
|
||||||
m_set.insert(cellLabel);
|
m_set.insert(cellLabel);
|
||||||
}
|
}
|
||||||
|
|
||||||
void AddConstituent(const Word &w, const ChartHypothesisCollection &stack)
|
void AddConstituent(const Word &w, const ChartHypothesisCollection &stack) {
|
||||||
{
|
|
||||||
ChartCellLabel cellLabel(m_coverage, w, &stack);
|
ChartCellLabel cellLabel(m_coverage, w, &stack);
|
||||||
m_set.insert(cellLabel);
|
m_set.insert(cellLabel);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Empty() const { return m_set.empty(); }
|
bool Empty() const {
|
||||||
|
return m_set.empty();
|
||||||
|
}
|
||||||
|
|
||||||
size_t GetSize() const { return m_set.size(); }
|
size_t GetSize() const {
|
||||||
|
return m_set.size();
|
||||||
|
}
|
||||||
|
|
||||||
const ChartCellLabel *Find(const Word &w) const
|
const ChartCellLabel *Find(const Word &w) const {
|
||||||
{
|
|
||||||
SetType::const_iterator p = m_set.find(ChartCellLabel(m_coverage, w));
|
SetType::const_iterator p = m_set.find(ChartCellLabel(m_coverage, w));
|
||||||
return p == m_set.end() ? 0 : &(*p);
|
return p == m_set.end() ? 0 : &(*p);
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
const WordsRange &m_coverage;
|
const WordsRange &m_coverage;
|
||||||
SetType m_set;
|
SetType m_set;
|
||||||
};
|
};
|
||||||
|
@ -57,15 +57,14 @@ ChartHypothesis::ChartHypothesis(const ChartTranslationOption &transOpt,
|
|||||||
const std::vector<HypothesisDimension> &childEntries = item.GetHypothesisDimensions();
|
const std::vector<HypothesisDimension> &childEntries = item.GetHypothesisDimensions();
|
||||||
m_prevHypos.reserve(childEntries.size());
|
m_prevHypos.reserve(childEntries.size());
|
||||||
std::vector<HypothesisDimension>::const_iterator iter;
|
std::vector<HypothesisDimension>::const_iterator iter;
|
||||||
for (iter = childEntries.begin(); iter != childEntries.end(); ++iter)
|
for (iter = childEntries.begin(); iter != childEntries.end(); ++iter) {
|
||||||
{
|
|
||||||
m_prevHypos.push_back(iter->GetHypothesis());
|
m_prevHypos.push_back(iter->GetHypothesis());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ChartHypothesis::~ChartHypothesis()
|
ChartHypothesis::~ChartHypothesis()
|
||||||
{
|
{
|
||||||
// delete feature function states
|
// delete feature function states
|
||||||
for (unsigned i = 0; i < m_ffStates.size(); ++i) {
|
for (unsigned i = 0; i < m_ffStates.size(); ++i) {
|
||||||
delete m_ffStates[i];
|
delete m_ffStates[i];
|
||||||
}
|
}
|
||||||
@ -98,8 +97,7 @@ void ChartHypothesis::CreateOutputPhrase(Phrase &outPhrase) const
|
|||||||
size_t nonTermInd = nonTermIndexMap[pos];
|
size_t nonTermInd = nonTermIndexMap[pos];
|
||||||
const ChartHypothesis *prevHypo = m_prevHypos[nonTermInd];
|
const ChartHypothesis *prevHypo = m_prevHypos[nonTermInd];
|
||||||
prevHypo->CreateOutputPhrase(outPhrase);
|
prevHypo->CreateOutputPhrase(outPhrase);
|
||||||
}
|
} else {
|
||||||
else {
|
|
||||||
outPhrase.AddWord(word);
|
outPhrase.AddWord(word);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -120,20 +118,19 @@ Phrase ChartHypothesis::GetOutputPhrase() const
|
|||||||
*/
|
*/
|
||||||
int ChartHypothesis::RecombineCompare(const ChartHypothesis &compare) const
|
int ChartHypothesis::RecombineCompare(const ChartHypothesis &compare) const
|
||||||
{
|
{
|
||||||
int comp = 0;
|
int comp = 0;
|
||||||
// -1 = this < compare
|
// -1 = this < compare
|
||||||
// +1 = this > compare
|
// +1 = this > compare
|
||||||
// 0 = this ==compare
|
// 0 = this ==compare
|
||||||
|
|
||||||
for (unsigned i = 0; i < m_ffStates.size(); ++i)
|
for (unsigned i = 0; i < m_ffStates.size(); ++i) {
|
||||||
{
|
if (m_ffStates[i] == NULL || compare.m_ffStates[i] == NULL)
|
||||||
if (m_ffStates[i] == NULL || compare.m_ffStates[i] == NULL)
|
|
||||||
comp = m_ffStates[i] - compare.m_ffStates[i];
|
comp = m_ffStates[i] - compare.m_ffStates[i];
|
||||||
else
|
else
|
||||||
comp = m_ffStates[i]->Compare(*compare.m_ffStates[i]);
|
comp = m_ffStates[i]->Compare(*compare.m_ffStates[i]);
|
||||||
|
|
||||||
if (comp != 0)
|
if (comp != 0)
|
||||||
return comp;
|
return comp;
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
@ -154,12 +151,12 @@ void ChartHypothesis::CalcScore()
|
|||||||
const ScoreComponentCollection &scoreBreakdown = GetCurrTargetPhrase().GetScoreBreakdown();
|
const ScoreComponentCollection &scoreBreakdown = GetCurrTargetPhrase().GetScoreBreakdown();
|
||||||
m_scoreBreakdown.PlusEquals(scoreBreakdown);
|
m_scoreBreakdown.PlusEquals(scoreBreakdown);
|
||||||
|
|
||||||
// compute values of stateless feature functions that were not
|
// compute values of stateless feature functions that were not
|
||||||
// cached in the translation option-- there is no principled distinction
|
// cached in the translation option-- there is no principled distinction
|
||||||
|
|
||||||
//const vector<const StatelessFeatureFunction*>& sfs =
|
//const vector<const StatelessFeatureFunction*>& sfs =
|
||||||
// m_manager.GetTranslationSystem()->GetStatelessFeatureFunctions();
|
// m_manager.GetTranslationSystem()->GetStatelessFeatureFunctions();
|
||||||
// TODO!
|
// TODO!
|
||||||
//for (unsigned i = 0; i < sfs.size(); ++i) {
|
//for (unsigned i = 0; i < sfs.size(); ++i) {
|
||||||
// sfs[i]->ChartEvaluate(m_targetPhrase, &m_scoreBreakdown);
|
// sfs[i]->ChartEvaluate(m_targetPhrase, &m_scoreBreakdown);
|
||||||
//}
|
//}
|
||||||
@ -167,7 +164,7 @@ void ChartHypothesis::CalcScore()
|
|||||||
const std::vector<const StatefulFeatureFunction*>& ffs =
|
const std::vector<const StatefulFeatureFunction*>& ffs =
|
||||||
m_manager.GetTranslationSystem()->GetStatefulFeatureFunctions();
|
m_manager.GetTranslationSystem()->GetStatefulFeatureFunctions();
|
||||||
for (unsigned i = 0; i < ffs.size(); ++i) {
|
for (unsigned i = 0; i < ffs.size(); ++i) {
|
||||||
m_ffStates[i] = ffs[i]->EvaluateChart(*this,i,&m_scoreBreakdown);
|
m_ffStates[i] = ffs[i]->EvaluateChart(*this,i,&m_scoreBreakdown);
|
||||||
}
|
}
|
||||||
|
|
||||||
m_totalScore = m_scoreBreakdown.GetWeightedScore();
|
m_totalScore = m_scoreBreakdown.GetWeightedScore();
|
||||||
@ -258,13 +255,12 @@ std::ostream& operator<<(std::ostream& out, const ChartHypothesis& hypo)
|
|||||||
{
|
{
|
||||||
|
|
||||||
out << hypo.GetId();
|
out << hypo.GetId();
|
||||||
|
|
||||||
// recombination
|
// recombination
|
||||||
if (hypo.GetWinningHypothesis() != NULL &&
|
if (hypo.GetWinningHypothesis() != NULL &&
|
||||||
hypo.GetWinningHypothesis() != &hypo)
|
hypo.GetWinningHypothesis() != &hypo) {
|
||||||
{
|
out << "->" << hypo.GetWinningHypothesis()->GetId();
|
||||||
out << "->" << hypo.GetWinningHypothesis()->GetId();
|
}
|
||||||
}
|
|
||||||
|
|
||||||
out << " " << hypo.GetCurrTargetPhrase()
|
out << " " << hypo.GetCurrTargetPhrase()
|
||||||
//<< " " << outPhrase
|
//<< " " << outPhrase
|
||||||
|
@ -55,7 +55,7 @@ protected:
|
|||||||
const ChartTranslationOption &m_transOpt;
|
const ChartTranslationOption &m_transOpt;
|
||||||
|
|
||||||
WordsRange m_currSourceWordsRange;
|
WordsRange m_currSourceWordsRange;
|
||||||
std::vector<const FFState*> m_ffStates; /*! stateful feature function states */
|
std::vector<const FFState*> m_ffStates; /*! stateful feature function states */
|
||||||
ScoreComponentCollection m_scoreBreakdown /*! detailed score break-down by components (for instance language model, word penalty, etc) */
|
ScoreComponentCollection m_scoreBreakdown /*! detailed score break-down by components (for instance language model, word penalty, etc) */
|
||||||
,m_lmNGram
|
,m_lmNGram
|
||||||
,m_lmPrefix;
|
,m_lmPrefix;
|
||||||
@ -94,7 +94,9 @@ public:
|
|||||||
|
|
||||||
~ChartHypothesis();
|
~ChartHypothesis();
|
||||||
|
|
||||||
// Returns this hypothesis' id (m_id).
unsigned GetId() const {
  return m_id;
}
|
||||||
|
|
||||||
const ChartTranslationOption &GetTranslationOption()const {
|
const ChartTranslationOption &GetTranslationOption()const {
|
||||||
return m_transOpt;
|
return m_transOpt;
|
||||||
@ -108,15 +110,17 @@ public:
|
|||||||
// Returns the stored arc-list pointer (m_arcList) without transferring ownership.
inline const ChartArcList* GetArcList() const {
  return m_arcList;
}
|
||||||
// Returns the stateful feature function state stored at index featureID.
// No bounds check is made: featureID must be a valid index into m_ffStates.
inline const FFState* GetFFState( size_t featureID ) const {
  return m_ffStates[ featureID ];
}
|
||||||
// Returns the ChartManager this hypothesis refers to (m_manager).
inline const ChartManager& GetManager() const {
  return m_manager;
}
|
||||||
|
|
||||||
void CreateOutputPhrase(Phrase &outPhrase) const;
|
void CreateOutputPhrase(Phrase &outPhrase) const;
|
||||||
Phrase GetOutputPhrase() const;
|
Phrase GetOutputPhrase() const;
|
||||||
|
|
||||||
int RecombineCompare(const ChartHypothesis &compare) const;
|
int RecombineCompare(const ChartHypothesis &compare) const;
|
||||||
|
|
||||||
void CalcScore();
|
void CalcScore();
|
||||||
|
|
||||||
@ -135,17 +139,17 @@ public:
|
|||||||
return m_prevHypos;
|
return m_prevHypos;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Returns the child hypothesis stored at index pos.
// No bounds check is made: pos must be a valid index into m_prevHypos.
const ChartHypothesis* GetPrevHypo(size_t pos) const {
  return m_prevHypos[pos];
}
|
||||||
|
|
||||||
// Returns the target left-hand-side word of the current target phrase.
const Word &GetTargetLHS() const {
  return GetCurrTargetPhrase().GetTargetLHS();
}
|
||||||
|
|
||||||
const ChartHypothesis* GetWinningHypothesis() const {
|
const ChartHypothesis* GetWinningHypothesis() const {
|
||||||
return m_winningHypo;
|
return m_winningHypo;
|
||||||
}
|
}
|
||||||
|
|
||||||
TO_STRING();
|
TO_STRING();
|
||||||
|
|
||||||
|
@ -101,8 +101,7 @@ bool ChartHypothesisCollection::AddHypothesis(ChartHypothesis *hypo, ChartManage
|
|||||||
VERBOSE(3,"worse than matching hyp " << hypoExisting->GetId() << ", recombining" << std::endl)
|
VERBOSE(3,"worse than matching hyp " << hypoExisting->GetId() << ", recombining" << std::endl)
|
||||||
if (m_nBestIsEnabled) {
|
if (m_nBestIsEnabled) {
|
||||||
hypoExisting->AddArc(hypo);
|
hypoExisting->AddArc(hypo);
|
||||||
}
|
} else {
|
||||||
else {
|
|
||||||
ChartHypothesis::Delete(hypo);
|
ChartHypothesis::Delete(hypo);
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
|
@ -43,7 +43,7 @@ public:
|
|||||||
bool operator()(const ChartHypothesis* hypoA, const ChartHypothesis* hypoB) const {
|
bool operator()(const ChartHypothesis* hypoA, const ChartHypothesis* hypoB) const {
|
||||||
// assert in same cell
|
// assert in same cell
|
||||||
const WordsRange &rangeA = hypoA->GetCurrSourceRange()
|
const WordsRange &rangeA = hypoA->GetCurrSourceRange()
|
||||||
, &rangeB = hypoB->GetCurrSourceRange();
|
, &rangeB = hypoB->GetCurrSourceRange();
|
||||||
CHECK(rangeA == rangeB);
|
CHECK(rangeA == rangeB);
|
||||||
|
|
||||||
// shouldn't be mixing hypos with different lhs
|
// shouldn't be mixing hypos with different lhs
|
||||||
@ -113,7 +113,9 @@ public:
|
|||||||
return m_hyposOrdered;
|
return m_hyposOrdered;
|
||||||
}
|
}
|
||||||
|
|
||||||
float GetBestScore() const { return m_bestScore; }
|
float GetBestScore() const {
|
||||||
|
return m_bestScore;
|
||||||
|
}
|
||||||
|
|
||||||
void GetSearchGraph(long translationId, std::ostream &outputSearchGraphStream, const std::map<unsigned,bool> &reachable) const;
|
void GetSearchGraph(long translationId, std::ostream &outputSearchGraphStream, const std::map<unsigned,bool> &reachable) const;
|
||||||
|
|
||||||
|
@ -231,17 +231,17 @@ void ChartManager::GetSearchGraph(long translationId, std::ostream &outputSearch
|
|||||||
{
|
{
|
||||||
size_t size = m_source.GetSize();
|
size_t size = m_source.GetSize();
|
||||||
|
|
||||||
// which hypotheses are reachable?
|
// which hypotheses are reachable?
|
||||||
std::map<unsigned,bool> reachable;
|
std::map<unsigned,bool> reachable;
|
||||||
WordsRange fullRange(0, size-1);
|
WordsRange fullRange(0, size-1);
|
||||||
const ChartCell &lastCell = m_hypoStackColl.Get(fullRange);
|
const ChartCell &lastCell = m_hypoStackColl.Get(fullRange);
|
||||||
const ChartHypothesis *hypo = lastCell.GetBestHypothesis();
|
const ChartHypothesis *hypo = lastCell.GetBestHypothesis();
|
||||||
|
|
||||||
if (hypo == NULL) {
|
if (hypo == NULL) {
|
||||||
// no hypothesis
|
// no hypothesis
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
FindReachableHypotheses( hypo, reachable);
|
FindReachableHypotheses( hypo, reachable);
|
||||||
|
|
||||||
for (size_t width = 1; width <= size; ++width) {
|
for (size_t width = 1; width <= size; ++width) {
|
||||||
for (size_t startPos = 0; startPos <= size-width; ++startPos) {
|
for (size_t startPos = 0; startPos <= size-width; ++startPos) {
|
||||||
@ -257,42 +257,40 @@ void ChartManager::GetSearchGraph(long translationId, std::ostream &outputSearch
|
|||||||
|
|
||||||
void ChartManager::FindReachableHypotheses( const ChartHypothesis *hypo, std::map<unsigned,bool> &reachable ) const
|
void ChartManager::FindReachableHypotheses( const ChartHypothesis *hypo, std::map<unsigned,bool> &reachable ) const
|
||||||
{
|
{
|
||||||
// do not recurse, if already visited
|
// do not recurse, if already visited
|
||||||
if (reachable.find(hypo->GetId()) != reachable.end())
|
if (reachable.find(hypo->GetId()) != reachable.end()) {
|
||||||
{
|
return;
|
||||||
return;
|
}
|
||||||
}
|
|
||||||
|
|
||||||
// recurse
|
// recurse
|
||||||
reachable[ hypo->GetId() ] = true;
|
reachable[ hypo->GetId() ] = true;
|
||||||
const std::vector<const ChartHypothesis*> &previous = hypo->GetPrevHypos();
|
const std::vector<const ChartHypothesis*> &previous = hypo->GetPrevHypos();
|
||||||
for(std::vector<const ChartHypothesis*>::const_iterator i = previous.begin(); i != previous.end(); ++i)
|
for(std::vector<const ChartHypothesis*>::const_iterator i = previous.begin(); i != previous.end(); ++i) {
|
||||||
{
|
FindReachableHypotheses( *i, reachable );
|
||||||
FindReachableHypotheses( *i, reachable );
|
}
|
||||||
}
|
|
||||||
|
|
||||||
// also loop over recombined hypotheses (arcs)
|
// also loop over recombined hypotheses (arcs)
|
||||||
const ChartArcList *arcList = hypo->GetArcList();
|
const ChartArcList *arcList = hypo->GetArcList();
|
||||||
if (arcList) {
|
if (arcList) {
|
||||||
ChartArcList::const_iterator iterArc;
|
ChartArcList::const_iterator iterArc;
|
||||||
for (iterArc = arcList->begin(); iterArc != arcList->end(); ++iterArc) {
|
for (iterArc = arcList->begin(); iterArc != arcList->end(); ++iterArc) {
|
||||||
const ChartHypothesis &arc = **iterArc;
|
const ChartHypothesis &arc = **iterArc;
|
||||||
FindReachableHypotheses( &arc, reachable );
|
FindReachableHypotheses( &arc, reachable );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void ChartManager::CreateDeviantPaths(
|
void ChartManager::CreateDeviantPaths(
|
||||||
boost::shared_ptr<const ChartTrellisPath> basePath,
|
boost::shared_ptr<const ChartTrellisPath> basePath,
|
||||||
ChartTrellisDetourQueue &q)
|
ChartTrellisDetourQueue &q)
|
||||||
{
|
{
|
||||||
CreateDeviantPaths(basePath, basePath->GetFinalNode(), q);
|
CreateDeviantPaths(basePath, basePath->GetFinalNode(), q);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ChartManager::CreateDeviantPaths(
|
void ChartManager::CreateDeviantPaths(
|
||||||
boost::shared_ptr<const ChartTrellisPath> basePath,
|
boost::shared_ptr<const ChartTrellisPath> basePath,
|
||||||
const ChartTrellisNode &substitutedNode,
|
const ChartTrellisNode &substitutedNode,
|
||||||
ChartTrellisDetourQueue &queue)
|
ChartTrellisDetourQueue &queue)
|
||||||
{
|
{
|
||||||
const ChartArcList *arcList = substitutedNode.GetHypothesis().GetArcList();
|
const ChartArcList *arcList = substitutedNode.GetHypothesis().GetArcList();
|
||||||
if (arcList) {
|
if (arcList) {
|
||||||
|
@ -69,7 +69,7 @@ public:
|
|||||||
void CalcNBest(size_t count, ChartTrellisPathList &ret,bool onlyDistinct=0) const;
|
void CalcNBest(size_t count, ChartTrellisPathList &ret,bool onlyDistinct=0) const;
|
||||||
|
|
||||||
void GetSearchGraph(long translationId, std::ostream &outputSearchGraphStream) const;
|
void GetSearchGraph(long translationId, std::ostream &outputSearchGraphStream) const;
|
||||||
void FindReachableHypotheses( const ChartHypothesis *hypo, std::map<unsigned,bool> &reachable ) const; /* auxilliary function for GetSearchGraph */
|
void FindReachableHypotheses( const ChartHypothesis *hypo, std::map<unsigned,bool> &reachable ) const; /* auxilliary function for GetSearchGraph */
|
||||||
|
|
||||||
const InputType& GetSource() const {
|
const InputType& GetSource() const {
|
||||||
return m_source;
|
return m_source;
|
||||||
@ -89,7 +89,9 @@ public:
|
|||||||
m_sentenceStats = std::auto_ptr<SentenceStats>(new SentenceStats(source));
|
m_sentenceStats = std::auto_ptr<SentenceStats>(new SentenceStats(source));
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned GetNextHypoId() { return m_hypothesisId++; }
|
unsigned GetNextHypoId() {
|
||||||
|
return m_hypothesisId++;
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -77,19 +77,19 @@ void ChartRuleLookupManagerMemory::GetChartRuleCollection(
|
|||||||
// get list of all rules that apply to spans at same starting position
|
// get list of all rules that apply to spans at same starting position
|
||||||
DottedRuleColl &dottedRuleCol = *m_dottedRuleColls[range.GetStartPos()];
|
DottedRuleColl &dottedRuleCol = *m_dottedRuleColls[range.GetStartPos()];
|
||||||
const DottedRuleList &expandableDottedRuleList = dottedRuleCol.GetExpandableDottedRuleList();
|
const DottedRuleList &expandableDottedRuleList = dottedRuleCol.GetExpandableDottedRuleList();
|
||||||
|
|
||||||
const ChartCellLabel &sourceWordLabel = GetCellCollection().Get(WordsRange(absEndPos, absEndPos)).GetSourceWordLabel();
|
const ChartCellLabel &sourceWordLabel = GetCellCollection().Get(WordsRange(absEndPos, absEndPos)).GetSourceWordLabel();
|
||||||
|
|
||||||
// loop through the rules
|
// loop through the rules
|
||||||
// (note that expandableDottedRuleList can be expanded as the loop runs
|
// (note that expandableDottedRuleList can be expanded as the loop runs
|
||||||
// through calls to ExtendPartialRuleApplication())
|
// through calls to ExtendPartialRuleApplication())
|
||||||
for (size_t ind = 0; ind < expandableDottedRuleList.size(); ++ind) {
|
for (size_t ind = 0; ind < expandableDottedRuleList.size(); ++ind) {
|
||||||
// rule we are about to extend
|
// rule we are about to extend
|
||||||
const DottedRuleInMemory &prevDottedRule = *expandableDottedRuleList[ind];
|
const DottedRuleInMemory &prevDottedRule = *expandableDottedRuleList[ind];
|
||||||
// we will now try to extend it, starting after where it ended
|
// we will now try to extend it, starting after where it ended
|
||||||
size_t startPos = prevDottedRule.IsRoot()
|
size_t startPos = prevDottedRule.IsRoot()
|
||||||
? range.GetStartPos()
|
? range.GetStartPos()
|
||||||
: prevDottedRule.GetWordsRange().GetEndPos() + 1;
|
: prevDottedRule.GetWordsRange().GetEndPos() + 1;
|
||||||
|
|
||||||
// search for terminal symbol
|
// search for terminal symbol
|
||||||
// (if only one more word position needs to be covered)
|
// (if only one more word position needs to be covered)
|
||||||
@ -102,15 +102,15 @@ void ChartRuleLookupManagerMemory::GetChartRuleCollection(
|
|||||||
|
|
||||||
// if we found a new rule -> create it and add it to the list
|
// if we found a new rule -> create it and add it to the list
|
||||||
if (node != NULL) {
|
if (node != NULL) {
|
||||||
// create the rule
|
// create the rule
|
||||||
#ifdef USE_BOOST_POOL
|
#ifdef USE_BOOST_POOL
|
||||||
DottedRuleInMemory *dottedRule = m_dottedRulePool.malloc();
|
DottedRuleInMemory *dottedRule = m_dottedRulePool.malloc();
|
||||||
new (dottedRule) DottedRuleInMemory(*node, sourceWordLabel,
|
new (dottedRule) DottedRuleInMemory(*node, sourceWordLabel,
|
||||||
prevDottedRule);
|
prevDottedRule);
|
||||||
#else
|
#else
|
||||||
DottedRuleInMemory *dottedRule = new DottedRuleInMemory(*node,
|
DottedRuleInMemory *dottedRule = new DottedRuleInMemory(*node,
|
||||||
sourceWordLabel,
|
sourceWordLabel,
|
||||||
prevDottedRule);
|
prevDottedRule);
|
||||||
#endif
|
#endif
|
||||||
dottedRuleCol.Add(relEndPos+1, dottedRule);
|
dottedRuleCol.Add(relEndPos+1, dottedRule);
|
||||||
}
|
}
|
||||||
@ -136,9 +136,7 @@ void ChartRuleLookupManagerMemory::GetChartRuleCollection(
|
|||||||
// word.
|
// word.
|
||||||
endPos = absEndPos - 1;
|
endPos = absEndPos - 1;
|
||||||
stackInd = relEndPos;
|
stackInd = relEndPos;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
endPos = absEndPos;
|
endPos = absEndPos;
|
||||||
stackInd = relEndPos + 1;
|
stackInd = relEndPos + 1;
|
||||||
}
|
}
|
||||||
@ -215,7 +213,7 @@ void ChartRuleLookupManagerMemory::ExtendPartialRuleApplication(
|
|||||||
// We'll do whichever minimises the number of lookups:
|
// We'll do whichever minimises the number of lookups:
|
||||||
if (numCombinations <= numChildren*2) {
|
if (numCombinations <= numChildren*2) {
|
||||||
|
|
||||||
// loop over possible source non-terminal labels (as found in input tree)
|
// loop over possible source non-terminal labels (as found in input tree)
|
||||||
NonTerminalSet::const_iterator p = sourceNonTerms.begin();
|
NonTerminalSet::const_iterator p = sourceNonTerms.begin();
|
||||||
NonTerminalSet::const_iterator sEnd = sourceNonTerms.end();
|
NonTerminalSet::const_iterator sEnd = sourceNonTerms.end();
|
||||||
for (; p != sEnd; ++p) {
|
for (; p != sEnd; ++p) {
|
||||||
@ -242,14 +240,12 @@ void ChartRuleLookupManagerMemory::ExtendPartialRuleApplication(
|
|||||||
new (rule) DottedRuleInMemory(*child, cellLabel, prevDottedRule);
|
new (rule) DottedRuleInMemory(*child, cellLabel, prevDottedRule);
|
||||||
#else
|
#else
|
||||||
DottedRuleInMemory *rule = new DottedRuleInMemory(*child, cellLabel,
|
DottedRuleInMemory *rule = new DottedRuleInMemory(*child, cellLabel,
|
||||||
prevDottedRule);
|
prevDottedRule);
|
||||||
#endif
|
#endif
|
||||||
dottedRuleColl.Add(stackInd, rule);
|
dottedRuleColl.Add(stackInd, rule);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
// loop over possible expansions of the rule
|
// loop over possible expansions of the rule
|
||||||
PhraseDictionaryNodeSCFG::NonTerminalMap::const_iterator p;
|
PhraseDictionaryNodeSCFG::NonTerminalMap::const_iterator p;
|
||||||
PhraseDictionaryNodeSCFG::NonTerminalMap::const_iterator end =
|
PhraseDictionaryNodeSCFG::NonTerminalMap::const_iterator end =
|
||||||
@ -274,7 +270,7 @@ void ChartRuleLookupManagerMemory::ExtendPartialRuleApplication(
|
|||||||
new (rule) DottedRuleInMemory(child, *cellLabel, prevDottedRule);
|
new (rule) DottedRuleInMemory(child, *cellLabel, prevDottedRule);
|
||||||
#else
|
#else
|
||||||
DottedRuleInMemory *rule = new DottedRuleInMemory(child, *cellLabel,
|
DottedRuleInMemory *rule = new DottedRuleInMemory(child, *cellLabel,
|
||||||
prevDottedRule);
|
prevDottedRule);
|
||||||
#endif
|
#endif
|
||||||
dottedRuleColl.Add(stackInd, rule);
|
dottedRuleColl.Add(stackInd, rule);
|
||||||
}
|
}
|
||||||
|
@ -30,7 +30,7 @@ namespace Moses
|
|||||||
{
|
{
|
||||||
|
|
||||||
void ChartTranslationOption::CalcEstimateOfBestScore(
|
void ChartTranslationOption::CalcEstimateOfBestScore(
|
||||||
const ChartCellCollection &allChartCells)
|
const ChartCellCollection &allChartCells)
|
||||||
{
|
{
|
||||||
const TargetPhrase &targetPhrase = **(m_targetPhraseCollection.begin());
|
const TargetPhrase &targetPhrase = **(m_targetPhraseCollection.begin());
|
||||||
m_estimateOfBestScore = targetPhrase.GetFutureScore();
|
m_estimateOfBestScore = targetPhrase.GetFutureScore();
|
||||||
|
@ -37,7 +37,7 @@ class ChartCellCollection;
|
|||||||
// of translations and provdes an estimate of the best score.
|
// of translations and provdes an estimate of the best score.
|
||||||
class ChartTranslationOption
|
class ChartTranslationOption
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
ChartTranslationOption(const TargetPhraseCollection &targetPhraseColl,
|
ChartTranslationOption(const TargetPhraseCollection &targetPhraseColl,
|
||||||
const DottedRule &dottedRule,
|
const DottedRule &dottedRule,
|
||||||
const WordsRange &wordsRange,
|
const WordsRange &wordsRange,
|
||||||
@ -45,16 +45,17 @@ class ChartTranslationOption
|
|||||||
: m_dottedRule(dottedRule)
|
: m_dottedRule(dottedRule)
|
||||||
, m_targetPhraseCollection(targetPhraseColl)
|
, m_targetPhraseCollection(targetPhraseColl)
|
||||||
, m_wordsRange(wordsRange)
|
, m_wordsRange(wordsRange)
|
||||||
, m_estimateOfBestScore(0)
|
, m_estimateOfBestScore(0) {
|
||||||
{
|
|
||||||
CalcEstimateOfBestScore(allChartCells);
|
CalcEstimateOfBestScore(allChartCells);
|
||||||
}
|
}
|
||||||
|
|
||||||
~ChartTranslationOption() {}
|
~ChartTranslationOption() {}
|
||||||
|
|
||||||
const DottedRule &GetDottedRule() const { return m_dottedRule; }
|
const DottedRule &GetDottedRule() const {
|
||||||
|
return m_dottedRule;
|
||||||
|
}
|
||||||
|
|
||||||
const TargetPhraseCollection &GetTargetPhraseCollection() const {
|
const TargetPhraseCollection &GetTargetPhraseCollection() const {
|
||||||
return m_targetPhraseCollection;
|
return m_targetPhraseCollection;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -65,9 +66,11 @@ class ChartTranslationOption
|
|||||||
// return an estimate of the best score possible with this translation option.
|
// return an estimate of the best score possible with this translation option.
|
||||||
// the estimate is the sum of the top target phrase's estimated score plus the
|
// the estimate is the sum of the top target phrase's estimated score plus the
|
||||||
// scores of the best child hypotheses.
|
// scores of the best child hypotheses.
|
||||||
inline float GetEstimateOfBestScore() const { return m_estimateOfBestScore; }
|
inline float GetEstimateOfBestScore() const {
|
||||||
|
return m_estimateOfBestScore;
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// not implemented
|
// not implemented
|
||||||
ChartTranslationOption &operator=(const ChartTranslationOption &);
|
ChartTranslationOption &operator=(const ChartTranslationOption &);
|
||||||
|
|
||||||
|
@ -106,8 +106,8 @@ void ChartTranslationOptionCollection::ProcessUnknownWord(size_t startPos, size_
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (startPos == 0 || startPos == m_source.GetSize() - 1)
|
if (startPos == 0 || startPos == m_source.GetSize() - 1) {
|
||||||
{ // don't create unknown words for <S> or </S> tags. Otherwise they can be moved. Should only be translated by glue rules
|
// don't create unknown words for <S> or </S> tags. Otherwise they can be moved. Should only be translated by glue rules
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -74,9 +74,9 @@ protected:
|
|||||||
|
|
||||||
public:
|
public:
|
||||||
ChartTranslationOptionCollection(InputType const& source
|
ChartTranslationOptionCollection(InputType const& source
|
||||||
, const TranslationSystem* system
|
, const TranslationSystem* system
|
||||||
, const ChartCellCollection &hypoStackColl
|
, const ChartCellCollection &hypoStackColl
|
||||||
, const std::vector<ChartRuleLookupManager*> &ruleLookupManagers);
|
, const std::vector<ChartRuleLookupManager*> &ruleLookupManagers);
|
||||||
virtual ~ChartTranslationOptionCollection();
|
virtual ~ChartTranslationOptionCollection();
|
||||||
void CreateTranslationOptionsForRange(size_t startPos
|
void CreateTranslationOptionsForRange(size_t startPos
|
||||||
, size_t endPos);
|
, size_t endPos);
|
||||||
|
@ -66,12 +66,11 @@ void ChartTranslationOptionList::Add(const TargetPhraseCollection &targetPhraseC
|
|||||||
if (m_collection.size() < ruleLimit) {
|
if (m_collection.size() < ruleLimit) {
|
||||||
// not yet filled out quota. add everything
|
// not yet filled out quota. add everything
|
||||||
ChartTranslationOption *option = new ChartTranslationOption(
|
ChartTranslationOption *option = new ChartTranslationOption(
|
||||||
targetPhraseCollection, dottedRule, m_range, chartCellColl);
|
targetPhraseCollection, dottedRule, m_range, chartCellColl);
|
||||||
m_collection.push_back(option);
|
m_collection.push_back(option);
|
||||||
float score = option->GetEstimateOfBestScore();
|
float score = option->GetEstimateOfBestScore();
|
||||||
m_scoreThreshold = (score < m_scoreThreshold) ? score : m_scoreThreshold;
|
m_scoreThreshold = (score < m_scoreThreshold) ? score : m_scoreThreshold;
|
||||||
}
|
} else {
|
||||||
else {
|
|
||||||
// full but not bursting. add if better than worst score
|
// full but not bursting. add if better than worst score
|
||||||
ChartTranslationOption option(targetPhraseCollection, dottedRule,
|
ChartTranslationOption option(targetPhraseCollection, dottedRule,
|
||||||
m_range, chartCellColl);
|
m_range, chartCellColl);
|
||||||
|
@ -1,17 +1,17 @@
|
|||||||
/***********************************************************************
|
/***********************************************************************
|
||||||
Moses - statistical machine translation system
|
Moses - statistical machine translation system
|
||||||
Copyright (C) 2006-2011 University of Edinburgh
|
Copyright (C) 2006-2011 University of Edinburgh
|
||||||
|
|
||||||
This library is free software; you can redistribute it and/or
|
This library is free software; you can redistribute it and/or
|
||||||
modify it under the terms of the GNU Lesser General Public
|
modify it under the terms of the GNU Lesser General Public
|
||||||
License as published by the Free Software Foundation; either
|
License as published by the Free Software Foundation; either
|
||||||
version 2.1 of the License, or (at your option) any later version.
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
This library is distributed in the hope that it will be useful,
|
This library is distributed in the hope that it will be useful,
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
Lesser General Public License for more details.
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
You should have received a copy of the GNU Lesser General Public
|
You should have received a copy of the GNU Lesser General Public
|
||||||
License along with this library; if not, write to the Free Software
|
License along with this library; if not, write to the Free Software
|
||||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
@ -27,15 +27,15 @@ namespace Moses
|
|||||||
{
|
{
|
||||||
|
|
||||||
ChartTrellisDetour::ChartTrellisDetour(
|
ChartTrellisDetour::ChartTrellisDetour(
|
||||||
boost::shared_ptr<const ChartTrellisPath> basePath,
|
boost::shared_ptr<const ChartTrellisPath> basePath,
|
||||||
const ChartTrellisNode &substitutedNode,
|
const ChartTrellisNode &substitutedNode,
|
||||||
const ChartHypothesis &replacementHypo)
|
const ChartHypothesis &replacementHypo)
|
||||||
: m_basePath(basePath)
|
: m_basePath(basePath)
|
||||||
, m_substitutedNode(substitutedNode)
|
, m_substitutedNode(substitutedNode)
|
||||||
, m_replacementHypo(replacementHypo)
|
, m_replacementHypo(replacementHypo)
|
||||||
{
|
{
|
||||||
float diff = replacementHypo.GetTotalScore()
|
float diff = replacementHypo.GetTotalScore()
|
||||||
- substitutedNode.GetHypothesis().GetTotalScore();
|
- substitutedNode.GetHypothesis().GetTotalScore();
|
||||||
m_totalScore = basePath->GetTotalScore() + diff;
|
m_totalScore = basePath->GetTotalScore() + diff;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,17 +1,17 @@
|
|||||||
/***********************************************************************
|
/***********************************************************************
|
||||||
Moses - statistical machine translation system
|
Moses - statistical machine translation system
|
||||||
Copyright (C) 2006-2011 University of Edinburgh
|
Copyright (C) 2006-2011 University of Edinburgh
|
||||||
|
|
||||||
This library is free software; you can redistribute it and/or
|
This library is free software; you can redistribute it and/or
|
||||||
modify it under the terms of the GNU Lesser General Public
|
modify it under the terms of the GNU Lesser General Public
|
||||||
License as published by the Free Software Foundation; either
|
License as published by the Free Software Foundation; either
|
||||||
version 2.1 of the License, or (at your option) any later version.
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
This library is distributed in the hope that it will be useful,
|
This library is distributed in the hope that it will be useful,
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
Lesser General Public License for more details.
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
You should have received a copy of the GNU Lesser General Public
|
You should have received a copy of the GNU Lesser General Public
|
||||||
License along with this library; if not, write to the Free Software
|
License along with this library; if not, write to the Free Software
|
||||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
@ -29,20 +29,24 @@ class ChartTrellisPath;
|
|||||||
|
|
||||||
class ChartTrellisDetour
|
class ChartTrellisDetour
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
ChartTrellisDetour(boost::shared_ptr<const ChartTrellisPath>,
|
ChartTrellisDetour(boost::shared_ptr<const ChartTrellisPath>,
|
||||||
const ChartTrellisNode &, const ChartHypothesis &);
|
const ChartTrellisNode &, const ChartHypothesis &);
|
||||||
|
|
||||||
const ChartTrellisPath &GetBasePath() const { return *m_basePath; }
|
const ChartTrellisPath &GetBasePath() const {
|
||||||
|
return *m_basePath;
|
||||||
|
}
|
||||||
const ChartTrellisNode &GetSubstitutedNode() const {
|
const ChartTrellisNode &GetSubstitutedNode() const {
|
||||||
return m_substitutedNode;
|
return m_substitutedNode;
|
||||||
}
|
}
|
||||||
const ChartHypothesis &GetReplacementHypo() const {
|
const ChartHypothesis &GetReplacementHypo() const {
|
||||||
return m_replacementHypo;
|
return m_replacementHypo;
|
||||||
}
|
}
|
||||||
float GetTotalScore() const { return m_totalScore; }
|
float GetTotalScore() const {
|
||||||
|
return m_totalScore;
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
boost::shared_ptr<const ChartTrellisPath> m_basePath;
|
boost::shared_ptr<const ChartTrellisPath> m_basePath;
|
||||||
const ChartTrellisNode &m_substitutedNode;
|
const ChartTrellisNode &m_substitutedNode;
|
||||||
const ChartHypothesis &m_replacementHypo;
|
const ChartHypothesis &m_replacementHypo;
|
||||||
|
@ -1,17 +1,17 @@
|
|||||||
/***********************************************************************
|
/***********************************************************************
|
||||||
Moses - statistical machine translation system
|
Moses - statistical machine translation system
|
||||||
Copyright (C) 2006-2011 University of Edinburgh
|
Copyright (C) 2006-2011 University of Edinburgh
|
||||||
|
|
||||||
This library is free software; you can redistribute it and/or
|
This library is free software; you can redistribute it and/or
|
||||||
modify it under the terms of the GNU Lesser General Public
|
modify it under the terms of the GNU Lesser General Public
|
||||||
License as published by the Free Software Foundation; either
|
License as published by the Free Software Foundation; either
|
||||||
version 2.1 of the License, or (at your option) any later version.
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
This library is distributed in the hope that it will be useful,
|
This library is distributed in the hope that it will be useful,
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
Lesser General Public License for more details.
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
You should have received a copy of the GNU Lesser General Public
|
You should have received a copy of the GNU Lesser General Public
|
||||||
License along with this library; if not, write to the Free Software
|
License along with this library; if not, write to the Free Software
|
||||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
@ -21,13 +21,16 @@
|
|||||||
|
|
||||||
#include "Util.h"
|
#include "Util.h"
|
||||||
|
|
||||||
namespace Moses {
|
namespace Moses
|
||||||
|
{
|
||||||
|
|
||||||
ChartTrellisDetourQueue::~ChartTrellisDetourQueue() {
|
ChartTrellisDetourQueue::~ChartTrellisDetourQueue()
|
||||||
|
{
|
||||||
RemoveAllInColl(m_queue);
|
RemoveAllInColl(m_queue);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ChartTrellisDetourQueue::Push(const ChartTrellisDetour *detour) {
|
void ChartTrellisDetourQueue::Push(const ChartTrellisDetour *detour)
|
||||||
|
{
|
||||||
if (m_capacity == 0 || m_queue.size() < m_capacity) {
|
if (m_capacity == 0 || m_queue.size() < m_capacity) {
|
||||||
m_queue.insert(detour);
|
m_queue.insert(detour);
|
||||||
} else if (detour->GetTotalScore() > (*m_queue.rbegin())->GetTotalScore()) {
|
} else if (detour->GetTotalScore() > (*m_queue.rbegin())->GetTotalScore()) {
|
||||||
@ -43,7 +46,8 @@ void ChartTrellisDetourQueue::Push(const ChartTrellisDetour *detour) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const ChartTrellisDetour *ChartTrellisDetourQueue::Pop() {
|
const ChartTrellisDetour *ChartTrellisDetourQueue::Pop()
|
||||||
|
{
|
||||||
QueueType::iterator p = m_queue.begin();
|
QueueType::iterator p = m_queue.begin();
|
||||||
const ChartTrellisDetour *top = *p;
|
const ChartTrellisDetour *top = *p;
|
||||||
m_queue.erase(p);
|
m_queue.erase(p);
|
||||||
|
@ -1,17 +1,17 @@
|
|||||||
/***********************************************************************
|
/***********************************************************************
|
||||||
Moses - statistical machine translation system
|
Moses - statistical machine translation system
|
||||||
Copyright (C) 2006-2011 University of Edinburgh
|
Copyright (C) 2006-2011 University of Edinburgh
|
||||||
|
|
||||||
This library is free software; you can redistribute it and/or
|
This library is free software; you can redistribute it and/or
|
||||||
modify it under the terms of the GNU Lesser General Public
|
modify it under the terms of the GNU Lesser General Public
|
||||||
License as published by the Free Software Foundation; either
|
License as published by the Free Software Foundation; either
|
||||||
version 2.1 of the License, or (at your option) any later version.
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
This library is distributed in the hope that it will be useful,
|
This library is distributed in the hope that it will be useful,
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
Lesser General Public License for more details.
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
You should have received a copy of the GNU Lesser General Public
|
You should have received a copy of the GNU Lesser General Public
|
||||||
License along with this library; if not, write to the Free Software
|
License along with this library; if not, write to the Free Software
|
||||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
@ -23,19 +23,23 @@
|
|||||||
|
|
||||||
#include <set>
|
#include <set>
|
||||||
|
|
||||||
namespace Moses {
|
namespace Moses
|
||||||
|
{
|
||||||
|
|
||||||
// A bounded priority queue of ChartTrellisDetour pointers. The top item is
|
// A bounded priority queue of ChartTrellisDetour pointers. The top item is
|
||||||
// the best scoring detour. The queue assumes ownership of pushed items and
|
// the best scoring detour. The queue assumes ownership of pushed items and
|
||||||
// relinquishes ownership when they are popped. Any remaining items at the
|
// relinquishes ownership when they are popped. Any remaining items at the
|
||||||
// time of the queue's destruction are deleted.
|
// time of the queue's destruction are deleted.
|
||||||
class ChartTrellisDetourQueue {
|
class ChartTrellisDetourQueue
|
||||||
public:
|
{
|
||||||
|
public:
|
||||||
// Create empty queue with fixed capacity of c. Capacity 0 means unbounded.
|
// Create empty queue with fixed capacity of c. Capacity 0 means unbounded.
|
||||||
ChartTrellisDetourQueue(size_t c) : m_capacity(c) {}
|
ChartTrellisDetourQueue(size_t c) : m_capacity(c) {}
|
||||||
~ChartTrellisDetourQueue();
|
~ChartTrellisDetourQueue();
|
||||||
|
|
||||||
bool Empty() const { return m_queue.empty(); }
|
bool Empty() const {
|
||||||
|
return m_queue.empty();
|
||||||
|
}
|
||||||
|
|
||||||
// Add the detour to the queue or delete it if the queue is full and the
|
// Add the detour to the queue or delete it if the queue is full and the
|
||||||
// score is no better than the queue's worst score.
|
// score is no better than the queue's worst score.
|
||||||
@ -45,7 +49,7 @@ class ChartTrellisDetourQueue {
|
|||||||
// caller is responsible for deleting the object.
|
// caller is responsible for deleting the object.
|
||||||
const ChartTrellisDetour *Pop();
|
const ChartTrellisDetour *Pop();
|
||||||
|
|
||||||
private:
|
private:
|
||||||
struct DetourOrderer {
|
struct DetourOrderer {
|
||||||
bool operator()(const ChartTrellisDetour* a,
|
bool operator()(const ChartTrellisDetour* a,
|
||||||
const ChartTrellisDetour* b) const {
|
const ChartTrellisDetour* b) const {
|
||||||
|
@ -31,16 +31,16 @@ namespace Moses
|
|||||||
{
|
{
|
||||||
|
|
||||||
ChartTrellisNode::ChartTrellisNode(const ChartHypothesis &hypo)
|
ChartTrellisNode::ChartTrellisNode(const ChartHypothesis &hypo)
|
||||||
: m_hypo(hypo)
|
: m_hypo(hypo)
|
||||||
{
|
{
|
||||||
CreateChildren();
|
CreateChildren();
|
||||||
}
|
}
|
||||||
|
|
||||||
ChartTrellisNode::ChartTrellisNode(const ChartTrellisDetour &detour,
|
ChartTrellisNode::ChartTrellisNode(const ChartTrellisDetour &detour,
|
||||||
ChartTrellisNode *&deviationPoint)
|
ChartTrellisNode *&deviationPoint)
|
||||||
: m_hypo((&detour.GetBasePath().GetFinalNode() == &detour.GetSubstitutedNode())
|
: m_hypo((&detour.GetBasePath().GetFinalNode() == &detour.GetSubstitutedNode())
|
||||||
? detour.GetReplacementHypo()
|
? detour.GetReplacementHypo()
|
||||||
: detour.GetBasePath().GetFinalNode().GetHypothesis())
|
: detour.GetBasePath().GetFinalNode().GetHypothesis())
|
||||||
{
|
{
|
||||||
if (&m_hypo == &detour.GetReplacementHypo()) {
|
if (&m_hypo == &detour.GetReplacementHypo()) {
|
||||||
deviationPoint = this;
|
deviationPoint = this;
|
||||||
@ -56,9 +56,9 @@ ChartTrellisNode::ChartTrellisNode(const ChartTrellisNode &root,
|
|||||||
const ChartTrellisNode &substitutedNode,
|
const ChartTrellisNode &substitutedNode,
|
||||||
const ChartHypothesis &replacementHypo,
|
const ChartHypothesis &replacementHypo,
|
||||||
ChartTrellisNode *&deviationPoint)
|
ChartTrellisNode *&deviationPoint)
|
||||||
: m_hypo((&root == &substitutedNode)
|
: m_hypo((&root == &substitutedNode)
|
||||||
? replacementHypo
|
? replacementHypo
|
||||||
: root.GetHypothesis())
|
: root.GetHypothesis())
|
||||||
{
|
{
|
||||||
if (&root == &substitutedNode) {
|
if (&root == &substitutedNode) {
|
||||||
deviationPoint = this;
|
deviationPoint = this;
|
||||||
@ -124,8 +124,8 @@ void ChartTrellisNode::CreateChildren(const ChartTrellisNode &rootNode,
|
|||||||
for (size_t ind = 0; ind < children.size(); ++ind) {
|
for (size_t ind = 0; ind < children.size(); ++ind) {
|
||||||
const ChartTrellisNode *origChild = children[ind];
|
const ChartTrellisNode *origChild = children[ind];
|
||||||
ChartTrellisNode *child = new ChartTrellisNode(*origChild, substitutedNode,
|
ChartTrellisNode *child = new ChartTrellisNode(*origChild, substitutedNode,
|
||||||
replacementHypo,
|
replacementHypo,
|
||||||
deviationPoint);
|
deviationPoint);
|
||||||
m_children.push_back(child);
|
m_children.push_back(child);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -32,7 +32,7 @@ class ChartTrellisDetour;
|
|||||||
|
|
||||||
class ChartTrellisNode
|
class ChartTrellisNode
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
typedef std::vector<ChartTrellisNode*> NodeChildren;
|
typedef std::vector<ChartTrellisNode*> NodeChildren;
|
||||||
|
|
||||||
ChartTrellisNode(const ChartHypothesis &hypo);
|
ChartTrellisNode(const ChartHypothesis &hypo);
|
||||||
@ -40,15 +40,21 @@ class ChartTrellisNode
|
|||||||
|
|
||||||
~ChartTrellisNode();
|
~ChartTrellisNode();
|
||||||
|
|
||||||
const ChartHypothesis &GetHypothesis() const { return m_hypo; }
|
const ChartHypothesis &GetHypothesis() const {
|
||||||
|
return m_hypo;
|
||||||
|
}
|
||||||
|
|
||||||
const NodeChildren &GetChildren() const { return m_children; }
|
const NodeChildren &GetChildren() const {
|
||||||
|
return m_children;
|
||||||
|
}
|
||||||
|
|
||||||
const ChartTrellisNode &GetChild(size_t i) const { return *m_children[i]; }
|
const ChartTrellisNode &GetChild(size_t i) const {
|
||||||
|
return *m_children[i];
|
||||||
|
}
|
||||||
|
|
||||||
Phrase GetOutputPhrase() const;
|
Phrase GetOutputPhrase() const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
ChartTrellisNode(const ChartTrellisNode &); // Not implemented
|
ChartTrellisNode(const ChartTrellisNode &); // Not implemented
|
||||||
ChartTrellisNode& operator=(const ChartTrellisNode &); // Not implemented
|
ChartTrellisNode& operator=(const ChartTrellisNode &); // Not implemented
|
||||||
|
|
||||||
|
@ -30,17 +30,17 @@ namespace Moses
|
|||||||
{
|
{
|
||||||
|
|
||||||
ChartTrellisPath::ChartTrellisPath(const ChartHypothesis &hypo)
|
ChartTrellisPath::ChartTrellisPath(const ChartHypothesis &hypo)
|
||||||
: m_finalNode(new ChartTrellisNode(hypo))
|
: m_finalNode(new ChartTrellisNode(hypo))
|
||||||
, m_deviationPoint(NULL)
|
, m_deviationPoint(NULL)
|
||||||
, m_scoreBreakdown(hypo.GetScoreBreakdown())
|
, m_scoreBreakdown(hypo.GetScoreBreakdown())
|
||||||
, m_totalScore(hypo.GetTotalScore())
|
, m_totalScore(hypo.GetTotalScore())
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
ChartTrellisPath::ChartTrellisPath(const ChartTrellisDetour &detour)
|
ChartTrellisPath::ChartTrellisPath(const ChartTrellisDetour &detour)
|
||||||
: m_finalNode(new ChartTrellisNode(detour, m_deviationPoint))
|
: m_finalNode(new ChartTrellisNode(detour, m_deviationPoint))
|
||||||
, m_scoreBreakdown(detour.GetBasePath().m_scoreBreakdown)
|
, m_scoreBreakdown(detour.GetBasePath().m_scoreBreakdown)
|
||||||
, m_totalScore(0)
|
, m_totalScore(0)
|
||||||
{
|
{
|
||||||
CHECK(m_deviationPoint);
|
CHECK(m_deviationPoint);
|
||||||
ScoreComponentCollection scoreChange;
|
ScoreComponentCollection scoreChange;
|
||||||
|
@ -36,18 +36,24 @@ class ChartTrellisNode;
|
|||||||
|
|
||||||
class ChartTrellisPath
|
class ChartTrellisPath
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
ChartTrellisPath(const ChartHypothesis &hypo);
|
ChartTrellisPath(const ChartHypothesis &hypo);
|
||||||
ChartTrellisPath(const ChartTrellisDetour &detour);
|
ChartTrellisPath(const ChartTrellisDetour &detour);
|
||||||
|
|
||||||
~ChartTrellisPath();
|
~ChartTrellisPath();
|
||||||
|
|
||||||
const ChartTrellisNode &GetFinalNode() const { return *m_finalNode; }
|
const ChartTrellisNode &GetFinalNode() const {
|
||||||
|
return *m_finalNode;
|
||||||
|
}
|
||||||
|
|
||||||
const ChartTrellisNode *GetDeviationPoint() const { return m_deviationPoint; }
|
const ChartTrellisNode *GetDeviationPoint() const {
|
||||||
|
return m_deviationPoint;
|
||||||
|
}
|
||||||
|
|
||||||
//! get score for this path throught trellis
|
//! get score for this path throught trellis
|
||||||
float GetTotalScore() const { return m_totalScore; }
|
float GetTotalScore() const {
|
||||||
|
return m_totalScore;
|
||||||
|
}
|
||||||
|
|
||||||
Phrase GetOutputPhrase() const;
|
Phrase GetOutputPhrase() const;
|
||||||
|
|
||||||
@ -56,7 +62,7 @@ class ChartTrellisPath
|
|||||||
return m_scoreBreakdown;
|
return m_scoreBreakdown;
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
ChartTrellisPath(const ChartTrellisPath &); // Not implemented
|
ChartTrellisPath(const ChartTrellisPath &); // Not implemented
|
||||||
ChartTrellisPath &operator=(const ChartTrellisPath &); // Not implemented
|
ChartTrellisPath &operator=(const ChartTrellisPath &); // Not implemented
|
||||||
|
|
||||||
|
@ -32,26 +32,38 @@ class DottedRule
|
|||||||
{
|
{
|
||||||
friend std::ostream& operator<<(std::ostream &, const DottedRule &);
|
friend std::ostream& operator<<(std::ostream &, const DottedRule &);
|
||||||
|
|
||||||
public:
|
public:
|
||||||
// used only to init dot stack.
|
// used only to init dot stack.
|
||||||
DottedRule()
|
DottedRule()
|
||||||
: m_cellLabel(NULL)
|
: m_cellLabel(NULL)
|
||||||
, m_prev(NULL) {}
|
, m_prev(NULL) {}
|
||||||
|
|
||||||
DottedRule(const ChartCellLabel &ccl, const DottedRule &prev)
|
DottedRule(const ChartCellLabel &ccl, const DottedRule &prev)
|
||||||
: m_cellLabel(&ccl)
|
: m_cellLabel(&ccl)
|
||||||
, m_prev(&prev) {}
|
, m_prev(&prev) {}
|
||||||
|
|
||||||
const WordsRange &GetWordsRange() const { return m_cellLabel->GetCoverage(); }
|
const WordsRange &GetWordsRange() const {
|
||||||
const Word &GetSourceWord() const { return m_cellLabel->GetLabel(); }
|
return m_cellLabel->GetCoverage();
|
||||||
bool IsNonTerminal() const { return m_cellLabel->GetLabel().IsNonTerminal(); }
|
}
|
||||||
const DottedRule *GetPrev() const { return m_prev; }
|
const Word &GetSourceWord() const {
|
||||||
bool IsRoot() const { return m_prev == NULL; }
|
return m_cellLabel->GetLabel();
|
||||||
const ChartCellLabel &GetChartCellLabel() const { return *m_cellLabel; }
|
}
|
||||||
|
bool IsNonTerminal() const {
|
||||||
|
return m_cellLabel->GetLabel().IsNonTerminal();
|
||||||
|
}
|
||||||
|
const DottedRule *GetPrev() const {
|
||||||
|
return m_prev;
|
||||||
|
}
|
||||||
|
bool IsRoot() const {
|
||||||
|
return m_prev == NULL;
|
||||||
|
}
|
||||||
|
const ChartCellLabel &GetChartCellLabel() const {
|
||||||
|
return *m_cellLabel;
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
const ChartCellLabel *m_cellLabel; // usually contains something, unless
|
const ChartCellLabel *m_cellLabel; // usually contains something, unless
|
||||||
// it's the init processed rule
|
// it's the init processed rule
|
||||||
const DottedRule *m_prev;
|
const DottedRule *m_prev;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -1,17 +1,17 @@
|
|||||||
/***********************************************************************
|
/***********************************************************************
|
||||||
Moses - statistical machine translation system
|
Moses - statistical machine translation system
|
||||||
Copyright (C) 2006-2011 University of Edinburgh
|
Copyright (C) 2006-2011 University of Edinburgh
|
||||||
|
|
||||||
This library is free software; you can redistribute it and/or
|
This library is free software; you can redistribute it and/or
|
||||||
modify it under the terms of the GNU Lesser General Public
|
modify it under the terms of the GNU Lesser General Public
|
||||||
License as published by the Free Software Foundation; either
|
License as published by the Free Software Foundation; either
|
||||||
version 2.1 of the License, or (at your option) any later version.
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
This library is distributed in the hope that it will be useful,
|
This library is distributed in the hope that it will be useful,
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
Lesser General Public License for more details.
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
You should have received a copy of the GNU Lesser General Public
|
You should have received a copy of the GNU Lesser General Public
|
||||||
License along with this library; if not, write to the Free Software
|
License along with this library; if not, write to the Free Software
|
||||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
@ -34,21 +34,23 @@ namespace Moses
|
|||||||
|
|
||||||
class DottedRuleInMemory : public DottedRule
|
class DottedRuleInMemory : public DottedRule
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
// used only to init dot stack.
|
// used only to init dot stack.
|
||||||
explicit DottedRuleInMemory(const PhraseDictionaryNodeSCFG &node)
|
explicit DottedRuleInMemory(const PhraseDictionaryNodeSCFG &node)
|
||||||
: DottedRule()
|
: DottedRule()
|
||||||
, m_node(node) {}
|
, m_node(node) {}
|
||||||
|
|
||||||
DottedRuleInMemory(const PhraseDictionaryNodeSCFG &node,
|
DottedRuleInMemory(const PhraseDictionaryNodeSCFG &node,
|
||||||
const ChartCellLabel &cellLabel,
|
const ChartCellLabel &cellLabel,
|
||||||
const DottedRuleInMemory &prev)
|
const DottedRuleInMemory &prev)
|
||||||
: DottedRule(cellLabel, prev)
|
: DottedRule(cellLabel, prev)
|
||||||
, m_node(node) {}
|
, m_node(node) {}
|
||||||
|
|
||||||
const PhraseDictionaryNodeSCFG &GetLastNode() const { return m_node; }
|
|
||||||
|
|
||||||
private:
|
const PhraseDictionaryNodeSCFG &GetLastNode() const {
|
||||||
|
return m_node;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
const PhraseDictionaryNodeSCFG &m_node;
|
const PhraseDictionaryNodeSCFG &m_node;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -34,26 +34,32 @@ namespace Moses
|
|||||||
{
|
{
|
||||||
class DottedRuleOnDisk : public DottedRule
|
class DottedRuleOnDisk : public DottedRule
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
// used only to init dot stack.
|
// used only to init dot stack.
|
||||||
explicit DottedRuleOnDisk(const OnDiskPt::PhraseNode &lastNode)
|
explicit DottedRuleOnDisk(const OnDiskPt::PhraseNode &lastNode)
|
||||||
: DottedRule()
|
: DottedRule()
|
||||||
, m_lastNode(lastNode)
|
, m_lastNode(lastNode)
|
||||||
, m_done(false) {}
|
, m_done(false) {}
|
||||||
|
|
||||||
DottedRuleOnDisk(const OnDiskPt::PhraseNode &lastNode,
|
DottedRuleOnDisk(const OnDiskPt::PhraseNode &lastNode,
|
||||||
const ChartCellLabel &cellLabel,
|
const ChartCellLabel &cellLabel,
|
||||||
const DottedRuleOnDisk &prev)
|
const DottedRuleOnDisk &prev)
|
||||||
: DottedRule(cellLabel, prev)
|
: DottedRule(cellLabel, prev)
|
||||||
, m_lastNode(lastNode)
|
, m_lastNode(lastNode)
|
||||||
, m_done(false) {}
|
, m_done(false) {}
|
||||||
|
|
||||||
const OnDiskPt::PhraseNode &GetLastNode() const { return m_lastNode; }
|
const OnDiskPt::PhraseNode &GetLastNode() const {
|
||||||
|
return m_lastNode;
|
||||||
|
}
|
||||||
|
|
||||||
bool Done() const { return m_done; }
|
bool Done() const {
|
||||||
void Done(bool value) const { m_done = value; }
|
return m_done;
|
||||||
|
}
|
||||||
|
void Done(bool value) const {
|
||||||
|
m_done = value;
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
const OnDiskPt::PhraseNode &m_lastNode;
|
const OnDiskPt::PhraseNode &m_lastNode;
|
||||||
mutable bool m_done;
|
mutable bool m_done;
|
||||||
};
|
};
|
||||||
|
@ -36,9 +36,9 @@ public:
|
|||||||
const ChartHypothesis&,
|
const ChartHypothesis&,
|
||||||
int /* featureID */,
|
int /* featureID */,
|
||||||
ScoreComponentCollection*) const {
|
ScoreComponentCollection*) const {
|
||||||
CHECK(0); // feature function not valid in chart decoder
|
CHECK(0); // feature function not valid in chart decoder
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
/** Doesn't do anything but provide a key into the global
|
/** Doesn't do anything but provide a key into the global
|
||||||
|
@ -22,176 +22,179 @@
|
|||||||
#include <ctime>
|
#include <ctime>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
|
||||||
namespace randlm {
|
namespace randlm
|
||||||
|
{
|
||||||
template<typename T>
|
|
||||||
class CacheNode {
|
|
||||||
public:
|
|
||||||
typedef std::map<wordID_t, CacheNode<T>* > childMap;
|
|
||||||
// initialise value to 'unknown' (i.e. not yet queried or cached).
|
|
||||||
CacheNode(T unknown_value) : value_(unknown_value) {}
|
|
||||||
childMap childs_; // child pointers
|
|
||||||
T value_; // value stored
|
|
||||||
const void* state_; // state pointer
|
|
||||||
};
|
|
||||||
|
|
||||||
template<typename T>
|
|
||||||
class Cache {
|
|
||||||
public:
|
|
||||||
typedef typename std::map<wordID_t, CacheNode<T>* >::iterator childPtr;
|
|
||||||
// unknown_value is used to indicate the ngram was not queried (yet)
|
|
||||||
// null_value_ indicates it was queried but not found in model
|
|
||||||
// space usage is handled by client.
|
|
||||||
Cache(T unknown_value, T null_value) :
|
|
||||||
cur_nodes_(0), unknown_value_(unknown_value), null_value_(null_value) {
|
|
||||||
root_ = newNode();
|
|
||||||
}
|
|
||||||
~Cache() {
|
|
||||||
if(clear()) {
|
|
||||||
delete root_;
|
|
||||||
root_ = NULL;
|
|
||||||
} else {
|
|
||||||
std::cerr << "Error freeing cache memory.\n";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
bool setCacheNgram(const wordID_t* ngram, int len, T value, const void* state) {
|
|
||||||
// inserts full ngram into cache
|
|
||||||
CacheNode<T>* node = root_;
|
|
||||||
for (int i = len - 1; i > -1; --i) {
|
|
||||||
childPtr child = node->childs_.find(ngram[i]);
|
|
||||||
if( child != node->childs_.end() ) {
|
|
||||||
// current node is already prefix. Go to child node
|
|
||||||
node = node->childs_[ngram[i]];
|
|
||||||
} else {
|
|
||||||
// no child for prefix. set new child link in current node
|
|
||||||
CacheNode<T> * newChild = newNode(node);
|
|
||||||
node->childs_[ngram[i]] = newChild;
|
|
||||||
// go to new node
|
|
||||||
node = newChild;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
node->value_ = value;
|
|
||||||
node->state_ = state;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
bool checkCacheNgram(const wordID_t* ngram, int len, T* value, const void** state) {
|
|
||||||
// finds value for this full ngram only (returns false if full ngram not in cache)
|
|
||||||
CacheNode<T> * node = root_;
|
|
||||||
for(int i = len - 1; i > -1; --i) {
|
|
||||||
// go to deepest level node of ngram in cache
|
|
||||||
childPtr child = node->childs_.find(ngram[i]);
|
|
||||||
if( child != node->childs_.end() ) {
|
|
||||||
// switch to child node
|
|
||||||
node = node->childs_[ngram[i]];
|
|
||||||
} else {
|
|
||||||
// not cached
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
*value = node->value_;
|
|
||||||
if(state) *state = node->state_;
|
|
||||||
return *value != null_value_ && *value != unknown_value_;
|
|
||||||
}
|
|
||||||
int getCache2(const wordID_t* ngram, int len, T** values, int* found) {
|
|
||||||
// set values array to point to cache value nodes
|
|
||||||
CacheNode<T> * node = root_;
|
|
||||||
*found = 0;
|
|
||||||
//values[0] = &node->value_; // pointer to root node's value
|
|
||||||
bool all_found = true;
|
|
||||||
for(int i = len - 1; i > -1; --i) {
|
|
||||||
// go to deepest level node of ngram in cache
|
|
||||||
childPtr child = node->childs_.find(ngram[i]);
|
|
||||||
if( child != node->childs_.end() ) {
|
|
||||||
// switch to child node
|
|
||||||
node = node->childs_[ngram[i]];
|
|
||||||
// get pointer to value (index by length - 1)
|
|
||||||
values[i] = &node->value_;
|
|
||||||
// if null_value then assume all extensions impossible
|
|
||||||
if (node->value_ == null_value_) {
|
|
||||||
return len - 1 - i; // max length posible
|
|
||||||
}
|
|
||||||
all_found = all_found && (node->value_ != unknown_value_);
|
|
||||||
if (all_found)
|
|
||||||
++(*found);
|
|
||||||
} else {
|
|
||||||
// initialise uncached values
|
|
||||||
CacheNode<T> * newChild = newNode(node);
|
|
||||||
node->childs_[ngram[i]] = newChild;
|
|
||||||
// go to new node
|
|
||||||
node = newChild;
|
|
||||||
values[i] = &node->value_;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return len; // all possible
|
|
||||||
}
|
|
||||||
int getCache(const wordID_t* ngram, int len, T** values, int* found) {
|
|
||||||
// get pointers to values for ngram and constituents.
|
|
||||||
// returns upper bound on longest subngram in model.
|
|
||||||
// 'found' stores longest non-null and known value found.
|
|
||||||
CacheNode<T> * node = root_;
|
|
||||||
*found = 0;
|
|
||||||
values[0] = &node->value_; // pointer to root node's value
|
|
||||||
bool all_found = true;
|
|
||||||
for(int i = len - 1; i > -1; --i) {
|
|
||||||
// go to deepest level node of ngram in cache
|
|
||||||
childPtr child = node->childs_.find(ngram[i]);
|
|
||||||
if( child != node->childs_.end() ) {
|
|
||||||
// switch to child node
|
|
||||||
node = node->childs_[ngram[i]];
|
|
||||||
// get pointer to value (index by length - 1)
|
|
||||||
values[len - i] = &node->value_;
|
|
||||||
// if null_value then assume all extensions impossible
|
|
||||||
if (node->value_ == null_value_)
|
|
||||||
return len - 1 - i; // max length posible
|
|
||||||
all_found = all_found && (node->value_ != unknown_value_);
|
|
||||||
if (all_found)
|
|
||||||
++(*found);
|
|
||||||
} else {
|
|
||||||
// initialise uncached values
|
|
||||||
CacheNode<T> * newChild = newNode(node);
|
|
||||||
node->childs_[ngram[i]] = newChild;
|
|
||||||
// go to new node
|
|
||||||
node = newChild;
|
|
||||||
values[len - i] = &node->value_;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return len; // all possible
|
|
||||||
}
|
|
||||||
bool clear() {
|
|
||||||
std::cerr << "Clearing cache with " << static_cast<float>(cur_nodes_ * nodeSize())
|
|
||||||
/ static_cast<float>(1ull << 20) << "MB" << std::endl;
|
|
||||||
return clearNodes(root_);
|
|
||||||
}
|
|
||||||
int nodes() {
|
|
||||||
// returns number of nodes
|
|
||||||
return cur_nodes_;
|
|
||||||
}
|
|
||||||
int nodeSize() {
|
|
||||||
return sizeof(CacheNode<T>) + sizeof(root_->childs_);
|
|
||||||
}
|
|
||||||
private:
|
|
||||||
CacheNode<T> * root_;
|
|
||||||
count_t cur_nodes_;
|
|
||||||
T unknown_value_; // Used to initialise data at each node
|
|
||||||
T null_value_; // Indicates cached something not in model
|
|
||||||
CacheNode<T>* newNode(CacheNode<T> * node = 0) {
|
|
||||||
++cur_nodes_;
|
|
||||||
return new CacheNode<T>(unknown_value_);
|
|
||||||
}
|
|
||||||
bool clearNodes(CacheNode<T> * node) {
|
|
||||||
//delete children from this node
|
|
||||||
if(!node->childs_.empty()) {
|
|
||||||
iterate(node->childs_, itr) {
|
|
||||||
if(!clearNodes(itr->second))
|
|
||||||
std::cerr << "Error emptying cache\n";
|
|
||||||
delete itr->second;
|
|
||||||
--cur_nodes_;
|
|
||||||
}
|
|
||||||
node->childs_.clear();
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
};
|
template<typename T>
|
||||||
|
class CacheNode
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
typedef std::map<wordID_t, CacheNode<T>* > childMap;
|
||||||
|
// initialise value to 'unknown' (i.e. not yet queried or cached).
|
||||||
|
CacheNode(T unknown_value) : value_(unknown_value) {}
|
||||||
|
childMap childs_; // child pointers
|
||||||
|
T value_; // value stored
|
||||||
|
const void* state_; // state pointer
|
||||||
|
};
|
||||||
|
|
||||||
|
template<typename T>
|
||||||
|
class Cache
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
typedef typename std::map<wordID_t, CacheNode<T>* >::iterator childPtr;
|
||||||
|
// unknown_value is used to indicate the ngram was not queried (yet)
|
||||||
|
// null_value_ indicates it was queried but not found in model
|
||||||
|
// space usage is handled by client.
|
||||||
|
Cache(T unknown_value, T null_value) :
|
||||||
|
cur_nodes_(0), unknown_value_(unknown_value), null_value_(null_value) {
|
||||||
|
root_ = newNode();
|
||||||
|
}
|
||||||
|
~Cache() {
|
||||||
|
if(clear()) {
|
||||||
|
delete root_;
|
||||||
|
root_ = NULL;
|
||||||
|
} else {
|
||||||
|
std::cerr << "Error freeing cache memory.\n";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
bool setCacheNgram(const wordID_t* ngram, int len, T value, const void* state) {
|
||||||
|
// inserts full ngram into cache
|
||||||
|
CacheNode<T>* node = root_;
|
||||||
|
for (int i = len - 1; i > -1; --i) {
|
||||||
|
childPtr child = node->childs_.find(ngram[i]);
|
||||||
|
if( child != node->childs_.end() ) {
|
||||||
|
// current node is already prefix. Go to child node
|
||||||
|
node = node->childs_[ngram[i]];
|
||||||
|
} else {
|
||||||
|
// no child for prefix. set new child link in current node
|
||||||
|
CacheNode<T> * newChild = newNode(node);
|
||||||
|
node->childs_[ngram[i]] = newChild;
|
||||||
|
// go to new node
|
||||||
|
node = newChild;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
node->value_ = value;
|
||||||
|
node->state_ = state;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
bool checkCacheNgram(const wordID_t* ngram, int len, T* value, const void** state) {
|
||||||
|
// finds value for this full ngram only (returns false if full ngram not in cache)
|
||||||
|
CacheNode<T> * node = root_;
|
||||||
|
for(int i = len - 1; i > -1; --i) {
|
||||||
|
// go to deepest level node of ngram in cache
|
||||||
|
childPtr child = node->childs_.find(ngram[i]);
|
||||||
|
if( child != node->childs_.end() ) {
|
||||||
|
// switch to child node
|
||||||
|
node = node->childs_[ngram[i]];
|
||||||
|
} else {
|
||||||
|
// not cached
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
*value = node->value_;
|
||||||
|
if(state) *state = node->state_;
|
||||||
|
return *value != null_value_ && *value != unknown_value_;
|
||||||
|
}
|
||||||
|
int getCache2(const wordID_t* ngram, int len, T** values, int* found) {
|
||||||
|
// set values array to point to cache value nodes
|
||||||
|
CacheNode<T> * node = root_;
|
||||||
|
*found = 0;
|
||||||
|
//values[0] = &node->value_; // pointer to root node's value
|
||||||
|
bool all_found = true;
|
||||||
|
for(int i = len - 1; i > -1; --i) {
|
||||||
|
// go to deepest level node of ngram in cache
|
||||||
|
childPtr child = node->childs_.find(ngram[i]);
|
||||||
|
if( child != node->childs_.end() ) {
|
||||||
|
// switch to child node
|
||||||
|
node = node->childs_[ngram[i]];
|
||||||
|
// get pointer to value (index by length - 1)
|
||||||
|
values[i] = &node->value_;
|
||||||
|
// if null_value then assume all extensions impossible
|
||||||
|
if (node->value_ == null_value_) {
|
||||||
|
return len - 1 - i; // max length posible
|
||||||
|
}
|
||||||
|
all_found = all_found && (node->value_ != unknown_value_);
|
||||||
|
if (all_found)
|
||||||
|
++(*found);
|
||||||
|
} else {
|
||||||
|
// initialise uncached values
|
||||||
|
CacheNode<T> * newChild = newNode(node);
|
||||||
|
node->childs_[ngram[i]] = newChild;
|
||||||
|
// go to new node
|
||||||
|
node = newChild;
|
||||||
|
values[i] = &node->value_;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return len; // all possible
|
||||||
|
}
|
||||||
|
int getCache(const wordID_t* ngram, int len, T** values, int* found) {
|
||||||
|
// get pointers to values for ngram and constituents.
|
||||||
|
// returns upper bound on longest subngram in model.
|
||||||
|
// 'found' stores longest non-null and known value found.
|
||||||
|
CacheNode<T> * node = root_;
|
||||||
|
*found = 0;
|
||||||
|
values[0] = &node->value_; // pointer to root node's value
|
||||||
|
bool all_found = true;
|
||||||
|
for(int i = len - 1; i > -1; --i) {
|
||||||
|
// go to deepest level node of ngram in cache
|
||||||
|
childPtr child = node->childs_.find(ngram[i]);
|
||||||
|
if( child != node->childs_.end() ) {
|
||||||
|
// switch to child node
|
||||||
|
node = node->childs_[ngram[i]];
|
||||||
|
// get pointer to value (index by length - 1)
|
||||||
|
values[len - i] = &node->value_;
|
||||||
|
// if null_value then assume all extensions impossible
|
||||||
|
if (node->value_ == null_value_)
|
||||||
|
return len - 1 - i; // max length posible
|
||||||
|
all_found = all_found && (node->value_ != unknown_value_);
|
||||||
|
if (all_found)
|
||||||
|
++(*found);
|
||||||
|
} else {
|
||||||
|
// initialise uncached values
|
||||||
|
CacheNode<T> * newChild = newNode(node);
|
||||||
|
node->childs_[ngram[i]] = newChild;
|
||||||
|
// go to new node
|
||||||
|
node = newChild;
|
||||||
|
values[len - i] = &node->value_;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return len; // all possible
|
||||||
|
}
|
||||||
|
bool clear() {
|
||||||
|
std::cerr << "Clearing cache with " << static_cast<float>(cur_nodes_ * nodeSize())
|
||||||
|
/ static_cast<float>(1ull << 20) << "MB" << std::endl;
|
||||||
|
return clearNodes(root_);
|
||||||
|
}
|
||||||
|
int nodes() {
|
||||||
|
// returns number of nodes
|
||||||
|
return cur_nodes_;
|
||||||
|
}
|
||||||
|
int nodeSize() {
|
||||||
|
return sizeof(CacheNode<T>) + sizeof(root_->childs_);
|
||||||
|
}
|
||||||
|
private:
|
||||||
|
CacheNode<T> * root_;
|
||||||
|
count_t cur_nodes_;
|
||||||
|
T unknown_value_; // Used to initialise data at each node
|
||||||
|
T null_value_; // Indicates cached something not in model
|
||||||
|
CacheNode<T>* newNode(CacheNode<T> * node = 0) {
|
||||||
|
++cur_nodes_;
|
||||||
|
return new CacheNode<T>(unknown_value_);
|
||||||
|
}
|
||||||
|
bool clearNodes(CacheNode<T> * node) {
|
||||||
|
//delete children from this node
|
||||||
|
if(!node->childs_.empty()) {
|
||||||
|
iterate(node->childs_, itr) {
|
||||||
|
if(!clearNodes(itr->second))
|
||||||
|
std::cerr << "Error emptying cache\n";
|
||||||
|
delete itr->second;
|
||||||
|
--cur_nodes_;
|
||||||
|
}
|
||||||
|
node->childs_.clear();
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
};
|
||||||
} //end namespace
|
} //end namespace
|
||||||
#endif //INC_RANDLM_CACHE_H
|
#endif //INC_RANDLM_CACHE_H
|
||||||
|
@ -20,295 +20,306 @@
|
|||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include "file.h"
|
#include "file.h"
|
||||||
|
|
||||||
namespace randlm {
|
namespace randlm
|
||||||
|
{
|
||||||
// Class Filter wraps a contiguous array of data. Filter and its subclasses
|
|
||||||
// implement read/write/increment functionality on arrays with arbitrary sized addresses
|
|
||||||
// (i.e. an address may not use a full number of bytes). When converting to byte-based
|
|
||||||
// representation we assume "unused" bits are to left.
|
|
||||||
// E.g. if the underlying data is stored in units T = uint16 and the 'width' = 11
|
|
||||||
// to read 'address' = 3 we extract bits at indices [33,42] (i.e. [11*3, 11*4 - 1])
|
|
||||||
// and store in a uint16 in positions 0000 0001 111111 where the first 7 bits have
|
|
||||||
// been masked out.
|
|
||||||
template<typename T>
|
|
||||||
class Filter {
|
|
||||||
public:
|
|
||||||
Filter(uint64_t addresses, int width) : addresses_(addresses), width_(width), data_(NULL) {
|
|
||||||
// number of bits in T
|
|
||||||
cell_width_ = sizeof(T) << 3;
|
|
||||||
// current implementation has following constraints
|
|
||||||
CHECK(cell_width_ > 0 && cell_width_ <= 64 && cell_width_ >= width);
|
|
||||||
// used for >> division
|
|
||||||
log_cell_width_ = static_cast<int>(floor(log(cell_width_)/log(2) + 0.000001));
|
|
||||||
// size of underlying data in Ts
|
|
||||||
cells_ = ((addresses * width) + cell_width_ - 1) >> log_cell_width_;
|
|
||||||
// instantiate underlying data
|
|
||||||
data_ = new T[cells_];
|
|
||||||
CHECK(data_ != NULL);
|
|
||||||
CHECK(reset());
|
|
||||||
// 'first_bit' marks the first bit used by 'address' (left padded with zeros).
|
|
||||||
first_bit_ = (width % cell_width_ == 0) ? 0 : cell_width_ - (width % cell_width_);
|
|
||||||
// mask for full cell
|
|
||||||
full_mask_ = static_cast<T>(0xffffffffffffffffull);
|
|
||||||
// mask for bits that make up the address
|
|
||||||
address_mask_ = full_mask_ >> first_bit_;
|
|
||||||
}
|
|
||||||
Filter(FileHandler* fin, bool loaddata = true) : data_(NULL) {
|
|
||||||
CHECK(loadHeader(fin));
|
|
||||||
if (loaddata)
|
|
||||||
CHECK(loadData(fin));
|
|
||||||
}
|
|
||||||
virtual ~Filter() {
|
|
||||||
delete[] data_;
|
|
||||||
}
|
|
||||||
bool reset() {
|
|
||||||
for (uint64_t i = 0; i < cells_; ++i)
|
|
||||||
data_[i] = 0;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
count_t size() {
|
|
||||||
// return approx size of filter in MBs
|
|
||||||
return cells_ * sizeof(T) >> 20;
|
|
||||||
}
|
|
||||||
// read / write functions
|
|
||||||
inline bool read(uint64_t address, T* value) {
|
|
||||||
CHECK(address <= addresses_);
|
|
||||||
// copy address to 'value'
|
|
||||||
uint64_t data_bit = address * width_;
|
|
||||||
uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
|
|
||||||
// 'offset' shows how address in 'data' and 'value' align
|
|
||||||
int offset = (data_bit % cell_width_) - first_bit_;
|
|
||||||
// they align so just copy across masking unneeded leading bits
|
|
||||||
if (offset == 0) {
|
|
||||||
*value = data_[data_cell] & address_mask_;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
// data address starts to left so shift it right
|
|
||||||
if (offset < 0) {
|
|
||||||
*value = (data_[data_cell] >> -offset) & address_mask_;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
// data address is to right so shift it left and look at one more cell to right
|
|
||||||
*value = ((data_[data_cell] << offset)
|
|
||||||
| (data_[data_cell + 1] >> (cell_width_ - offset))) & address_mask_ ;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
inline T read(uint64_t address) {
|
|
||||||
CHECK(address <= addresses_);
|
|
||||||
// return value at address
|
|
||||||
T value = 0;
|
|
||||||
uint64_t data_bit = address * width_;
|
|
||||||
uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
|
|
||||||
// 'offset' shows how address in 'data' and 'value' align
|
|
||||||
int offset = (data_bit % cell_width_) - first_bit_;
|
|
||||||
// they align so just copy across masking unneeded leading bits
|
|
||||||
if (offset == 0) {
|
|
||||||
value = data_[data_cell] & address_mask_;
|
|
||||||
}
|
|
||||||
// data address starts to left so shift it right
|
|
||||||
else if (offset < 0) {
|
|
||||||
value = (data_[data_cell] >> -offset) & address_mask_;
|
|
||||||
}
|
|
||||||
// data address is to right so shift it left and look at one more cell to right
|
|
||||||
else
|
|
||||||
value = ((data_[data_cell] << offset)
|
|
||||||
| (data_[data_cell + 1] >> (cell_width_ - offset))) & address_mask_ ;
|
|
||||||
return value;
|
|
||||||
}
|
|
||||||
inline bool write(uint64_t address, T value) {
|
|
||||||
CHECK(address <= addresses_);
|
|
||||||
CHECK(log2(value) <= width_);
|
|
||||||
// write 'value' to address
|
|
||||||
uint64_t data_bit = address * width_;
|
|
||||||
uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
|
|
||||||
// 'offset' shows how address in 'data' and 'value' align
|
|
||||||
int offset = (data_bit % cell_width_) - first_bit_;
|
|
||||||
// they align so just copy across masking unneeded leading zeros of value
|
|
||||||
if (offset == 0) {
|
|
||||||
data_[data_cell] = value | (data_[data_cell] & ~address_mask_);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
// address in data is to left so shift value left by -offset
|
|
||||||
if (offset < 0) {
|
|
||||||
data_[data_cell] = (value << -offset)
|
|
||||||
| (data_[data_cell] & ~(address_mask_ << -offset));
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
// address in data is to right so shift value right by offset
|
|
||||||
data_[data_cell] = (value >> offset) |
|
|
||||||
(data_[data_cell] & ~(address_mask_ >> offset));
|
|
||||||
data_[data_cell + 1] = (value << (cell_width_ - offset)) |
|
|
||||||
(data_[data_cell + 1] & (full_mask_ >> offset));
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
inline bool readWithFingerprint(uint64_t address, T finger, T* value) {
|
|
||||||
// copy 'address' ^ 'finger' to 'value'
|
|
||||||
uint64_t data_bit = address * width_;
|
|
||||||
uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
|
|
||||||
// 'offset' shows how address in 'data' and 'value' align
|
|
||||||
int offset = (data_bit % cell_width_) - first_bit_;
|
|
||||||
// they align so just copy across masking unneeded leading bits
|
|
||||||
if (offset == 0) {
|
|
||||||
*value = (finger ^ data_[data_cell]) & address_mask_;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
// data address starts to left so shift it right
|
|
||||||
if (offset < 0) {
|
|
||||||
*value = ((data_[data_cell] >> -offset) ^ finger) & address_mask_;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
// data address is to right so shift it left and look at one more cell to right
|
|
||||||
*value = (((data_[data_cell] << offset)
|
|
||||||
| (data_[data_cell + 1] >> (cell_width_ - offset))) ^ finger)
|
|
||||||
& address_mask_ ;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
inline bool writeWithFingerprint(uint64_t address, T finger, T value) {
|
|
||||||
// write 'value' ^ 'finger' to address
|
|
||||||
finger &= address_mask_; // make sure fingerprint is correct size
|
|
||||||
uint64_t data_bit = address * width_;
|
|
||||||
uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
|
|
||||||
// 'offset' shows how address in 'data' and 'value' align
|
|
||||||
int offset = (data_bit % cell_width_) - first_bit_;
|
|
||||||
// they align so just copy across masking unneeded leading zeros of value
|
|
||||||
if (offset == 0) {
|
|
||||||
data_[data_cell] = (finger ^ value) | (data_[data_cell] & ~address_mask_);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
// address in data is to left so shift value left by -offset
|
|
||||||
if (offset < 0) {
|
|
||||||
data_[data_cell] = ((finger ^ value) << -offset)
|
|
||||||
| (data_[data_cell] & ~(address_mask_ << -offset));
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
// address in data is to right so shift value right by offset
|
|
||||||
data_[data_cell] = ((finger ^ value) >> offset) |
|
|
||||||
(data_[data_cell] & ~(address_mask_ >> offset));
|
|
||||||
data_[data_cell + 1] = ((finger ^ value) << (cell_width_ - offset)) |
|
|
||||||
(data_[data_cell + 1] & (full_mask_ >> offset));
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
// debugging
|
|
||||||
void printFilter(const std::string & prefix = "", uint32_t truncate = 64){
|
|
||||||
std::cout << prefix;
|
|
||||||
for (uint32_t i = 0; i < cells_ && i < truncate; ++i) {
|
|
||||||
for (int j = cell_width_ - 1; j >= 0; --j)
|
|
||||||
if (data_[i] & (1ull << j))
|
|
||||||
std::cout << 1;
|
|
||||||
else
|
|
||||||
std::cout << 0;
|
|
||||||
std::cout << "\n";
|
|
||||||
}
|
|
||||||
std::cout << std::endl;
|
|
||||||
}
|
|
||||||
// i/o
|
|
||||||
uint64_t getAddresses() { return addresses_; }
|
|
||||||
int getWidth() { return width_; }
|
|
||||||
int getCellWidth() { return cell_width_; }
|
|
||||||
uint32_t getCells() { return cells_; }
|
|
||||||
virtual bool save(FileHandler* out) {
|
|
||||||
CHECK(out != NULL);
|
|
||||||
CHECK(out->write((char*)&cells_, sizeof(cells_)));
|
|
||||||
CHECK(out->write((char*)&cell_width_, sizeof(cell_width_)));
|
|
||||||
CHECK(out->write((char*)&log_cell_width_, sizeof(log_cell_width_)));
|
|
||||||
CHECK(out->write((char*)&addresses_, sizeof(addresses_)));
|
|
||||||
CHECK(out->write((char*)&width_, sizeof(width_)));
|
|
||||||
CHECK(out->write((char*)&first_bit_, sizeof(first_bit_)));
|
|
||||||
CHECK(out->write((char*)&full_mask_, sizeof(full_mask_)));
|
|
||||||
CHECK(out->write((char*)&address_mask_, sizeof(address_mask_)));
|
|
||||||
//CHECK(out->write((char*)data_, cells_ * sizeof(T)));
|
|
||||||
const uint64_t jump = 524288032ul; //(uint64_t)pow(2, 29);
|
|
||||||
if((width_ == 1) || cells_ < jump)
|
|
||||||
CHECK(out->write((char*)data_, cells_ * sizeof(T)));
|
|
||||||
else {
|
|
||||||
uint64_t idx(0);
|
|
||||||
while(idx + jump < cells_) {
|
|
||||||
CHECK(out->write((char*)&data_[idx], jump * sizeof(T)));
|
|
||||||
idx += jump;
|
|
||||||
}
|
|
||||||
CHECK(out->write((char*)&data_[idx], (cells_ - idx) * sizeof(T)));
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
protected:
|
|
||||||
bool loadHeader(FileHandler* fin) {
|
|
||||||
CHECK(fin != NULL);
|
|
||||||
CHECK(fin->read((char*)&cells_, sizeof(cells_)));
|
|
||||||
CHECK(fin->read((char*)&cell_width_, sizeof(cell_width_)));
|
|
||||||
CHECK(cell_width_ == sizeof(T) << 3); // make sure correct underlying data type
|
|
||||||
CHECK(fin->read((char*)&log_cell_width_, sizeof(log_cell_width_)));
|
|
||||||
CHECK(fin->read((char*)&addresses_, sizeof(addresses_)));
|
|
||||||
CHECK(fin->read((char*)&width_, sizeof(width_)));
|
|
||||||
CHECK(fin->read((char*)&first_bit_, sizeof(first_bit_)));
|
|
||||||
CHECK(fin->read((char*)&full_mask_, sizeof(full_mask_)));
|
|
||||||
CHECK(fin->read((char*)&address_mask_, sizeof(address_mask_)));
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
bool loadData(FileHandler* fin) {
|
|
||||||
// instantiate underlying array
|
|
||||||
data_ = new T[cells_];
|
|
||||||
CHECK(data_ != NULL);
|
|
||||||
CHECK(fin->read((char*)data_, cells_ * sizeof(T)));
|
|
||||||
//CHECK(fin->read((char*)&data_[0], ceil(float(cells_) / 2.0) * sizeof(T)));
|
|
||||||
//CHECK(fin->read((char*)&data_[cells_ / 2], (cells_ / 2) * sizeof(T)));
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
uint64_t cells_; // number T making up 'data_'
|
|
||||||
int cell_width_; // bits per cell (i.e. sizeof(T) << 3)
|
|
||||||
int log_cell_width_; // log of bits used for >> division
|
|
||||||
uint64_t addresses_; // number of addresses in the filter
|
|
||||||
int width_; // width in bits of each address
|
|
||||||
int first_bit_; // position of first bit in initial byte
|
|
||||||
T full_mask_; // all 1s
|
|
||||||
T address_mask_; // 1s in those positions that are part of address
|
|
||||||
T* data_; // the raw data as bytes
|
|
||||||
};
|
|
||||||
|
|
||||||
// Extension with bit test/setter methods added
|
// Class Filter wraps a contiguous array of data. Filter and its subclasses
|
||||||
class BitFilter : public Filter<uint8_t> {
|
// implement read/write/increment functionality on arrays with arbitrary sized addresses
|
||||||
public:
|
// (i.e. an address may not use a full number of bytes). When converting to byte-based
|
||||||
BitFilter(uint64_t bits) : Filter<uint8_t>(bits, 1) {}
|
// representation we assume "unused" bits are to left.
|
||||||
BitFilter(FileHandler* fin, bool loaddata = true)
|
// E.g. if the underlying data is stored in units T = uint16 and the 'width' = 11
|
||||||
: Filter<uint8_t>(fin, loaddata) {
|
// to read 'address' = 3 we extract bits at indices [33,42] (i.e. [11*3, 11*4 - 1])
|
||||||
if (loaddata)
|
// and store in a uint16 in positions 0000 0001 111111 where the first 7 bits have
|
||||||
CHECK(load(fin));
|
// been masked out.
|
||||||
}
|
template<typename T>
|
||||||
// TODO: overload operator[]
|
class Filter
|
||||||
virtual bool testBit(uint64_t location) {
|
{
|
||||||
// test bit referenced by location
|
public:
|
||||||
return data_[(location % addresses_) >> 3] & 1 << ((location % addresses_) % 8);
|
Filter(uint64_t addresses, int width) : addresses_(addresses), width_(width), data_(NULL) {
|
||||||
}
|
// number of bits in T
|
||||||
virtual bool setBit(uint64_t location) {
|
cell_width_ = sizeof(T) << 3;
|
||||||
// set bit referenced by location
|
// current implementation has following constraints
|
||||||
data_[(location % addresses_) >> 3] |= 1 << ((location % addresses_) % 8);
|
CHECK(cell_width_ > 0 && cell_width_ <= 64 && cell_width_ >= width);
|
||||||
|
// used for >> division
|
||||||
|
log_cell_width_ = static_cast<int>(floor(log(cell_width_)/log(2) + 0.000001));
|
||||||
|
// size of underlying data in Ts
|
||||||
|
cells_ = ((addresses * width) + cell_width_ - 1) >> log_cell_width_;
|
||||||
|
// instantiate underlying data
|
||||||
|
data_ = new T[cells_];
|
||||||
|
CHECK(data_ != NULL);
|
||||||
|
CHECK(reset());
|
||||||
|
// 'first_bit' marks the first bit used by 'address' (left padded with zeros).
|
||||||
|
first_bit_ = (width % cell_width_ == 0) ? 0 : cell_width_ - (width % cell_width_);
|
||||||
|
// mask for full cell
|
||||||
|
full_mask_ = static_cast<T>(0xffffffffffffffffull);
|
||||||
|
// mask for bits that make up the address
|
||||||
|
address_mask_ = full_mask_ >> first_bit_;
|
||||||
|
}
|
||||||
|
Filter(FileHandler* fin, bool loaddata = true) : data_(NULL) {
|
||||||
|
CHECK(loadHeader(fin));
|
||||||
|
if (loaddata)
|
||||||
|
CHECK(loadData(fin));
|
||||||
|
}
|
||||||
|
virtual ~Filter() {
|
||||||
|
delete[] data_;
|
||||||
|
}
|
||||||
|
bool reset() {
|
||||||
|
for (uint64_t i = 0; i < cells_; ++i)
|
||||||
|
data_[i] = 0;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
count_t size() {
|
||||||
|
// return approx size of filter in MBs
|
||||||
|
return cells_ * sizeof(T) >> 20;
|
||||||
|
}
|
||||||
|
// read / write functions
|
||||||
|
inline bool read(uint64_t address, T* value) {
|
||||||
|
CHECK(address <= addresses_);
|
||||||
|
// copy address to 'value'
|
||||||
|
uint64_t data_bit = address * width_;
|
||||||
|
uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
|
||||||
|
// 'offset' shows how address in 'data' and 'value' align
|
||||||
|
int offset = (data_bit % cell_width_) - first_bit_;
|
||||||
|
// they align so just copy across masking unneeded leading bits
|
||||||
|
if (offset == 0) {
|
||||||
|
*value = data_[data_cell] & address_mask_;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
virtual bool clearBit(uint64_t location) {
|
// data address starts to left so shift it right
|
||||||
// set bit referenced by location
|
if (offset < 0) {
|
||||||
data_[(location % addresses_) >> 3] &= 0 << ((location % addresses_) % 8);
|
*value = (data_[data_cell] >> -offset) & address_mask_;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
bool save(FileHandler* fout) {
|
// data address is to right so shift it left and look at one more cell to right
|
||||||
CHECK(Filter<uint8_t>::save(fout));
|
*value = ((data_[data_cell] << offset)
|
||||||
std::cerr << "Saved BitFilter. Rho = " << rho() << "." << std::endl;;
|
| (data_[data_cell + 1] >> (cell_width_ - offset))) & address_mask_ ;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
inline T read(uint64_t address) {
|
||||||
|
CHECK(address <= addresses_);
|
||||||
|
// return value at address
|
||||||
|
T value = 0;
|
||||||
|
uint64_t data_bit = address * width_;
|
||||||
|
uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
|
||||||
|
// 'offset' shows how address in 'data' and 'value' align
|
||||||
|
int offset = (data_bit % cell_width_) - first_bit_;
|
||||||
|
// they align so just copy across masking unneeded leading bits
|
||||||
|
if (offset == 0) {
|
||||||
|
value = data_[data_cell] & address_mask_;
|
||||||
|
}
|
||||||
|
// data address starts to left so shift it right
|
||||||
|
else if (offset < 0) {
|
||||||
|
value = (data_[data_cell] >> -offset) & address_mask_;
|
||||||
|
}
|
||||||
|
// data address is to right so shift it left and look at one more cell to right
|
||||||
|
else
|
||||||
|
value = ((data_[data_cell] << offset)
|
||||||
|
| (data_[data_cell + 1] >> (cell_width_ - offset))) & address_mask_ ;
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
inline bool write(uint64_t address, T value) {
|
||||||
|
CHECK(address <= addresses_);
|
||||||
|
CHECK(log2(value) <= width_);
|
||||||
|
// write 'value' to address
|
||||||
|
uint64_t data_bit = address * width_;
|
||||||
|
uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
|
||||||
|
// 'offset' shows how address in 'data' and 'value' align
|
||||||
|
int offset = (data_bit % cell_width_) - first_bit_;
|
||||||
|
// they align so just copy across masking unneeded leading zeros of value
|
||||||
|
if (offset == 0) {
|
||||||
|
data_[data_cell] = value | (data_[data_cell] & ~address_mask_);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
float rho(uint64_t limit = 0) {
|
// address in data is to left so shift value left by -offset
|
||||||
uint64_t ones = 0;
|
if (offset < 0) {
|
||||||
uint64_t range = limit > 0 ? std::min(limit,cells_) : cells_;
|
data_[data_cell] = (value << -offset)
|
||||||
for (uint64_t i = 0; i < range; ++i)
|
| (data_[data_cell] & ~(address_mask_ << -offset));
|
||||||
for (int j = 0; j < 8; ++j)
|
|
||||||
if (data_[i] & (1 << j))
|
|
||||||
++ones;
|
|
||||||
return static_cast<float>((range << 3) - ones)/static_cast<float>(range << 3);
|
|
||||||
}
|
|
||||||
protected:
|
|
||||||
bool load(FileHandler* fin) {
|
|
||||||
std::cerr << "Loaded BitFilter. Rho = " << rho() << "." << std::endl;;
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
};
|
// address in data is to right so shift value right by offset
|
||||||
/*
|
data_[data_cell] = (value >> offset) |
|
||||||
|
(data_[data_cell] & ~(address_mask_ >> offset));
|
||||||
|
data_[data_cell + 1] = (value << (cell_width_ - offset)) |
|
||||||
|
(data_[data_cell + 1] & (full_mask_ >> offset));
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
inline bool readWithFingerprint(uint64_t address, T finger, T* value) {
|
||||||
|
// copy 'address' ^ 'finger' to 'value'
|
||||||
|
uint64_t data_bit = address * width_;
|
||||||
|
uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
|
||||||
|
// 'offset' shows how address in 'data' and 'value' align
|
||||||
|
int offset = (data_bit % cell_width_) - first_bit_;
|
||||||
|
// they align so just copy across masking unneeded leading bits
|
||||||
|
if (offset == 0) {
|
||||||
|
*value = (finger ^ data_[data_cell]) & address_mask_;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
// data address starts to left so shift it right
|
||||||
|
if (offset < 0) {
|
||||||
|
*value = ((data_[data_cell] >> -offset) ^ finger) & address_mask_;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
// data address is to right so shift it left and look at one more cell to right
|
||||||
|
*value = (((data_[data_cell] << offset)
|
||||||
|
| (data_[data_cell + 1] >> (cell_width_ - offset))) ^ finger)
|
||||||
|
& address_mask_ ;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
inline bool writeWithFingerprint(uint64_t address, T finger, T value) {
|
||||||
|
// write 'value' ^ 'finger' to address
|
||||||
|
finger &= address_mask_; // make sure fingerprint is correct size
|
||||||
|
uint64_t data_bit = address * width_;
|
||||||
|
uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
|
||||||
|
// 'offset' shows how address in 'data' and 'value' align
|
||||||
|
int offset = (data_bit % cell_width_) - first_bit_;
|
||||||
|
// they align so just copy across masking unneeded leading zeros of value
|
||||||
|
if (offset == 0) {
|
||||||
|
data_[data_cell] = (finger ^ value) | (data_[data_cell] & ~address_mask_);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
// address in data is to left so shift value left by -offset
|
||||||
|
if (offset < 0) {
|
||||||
|
data_[data_cell] = ((finger ^ value) << -offset)
|
||||||
|
| (data_[data_cell] & ~(address_mask_ << -offset));
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
// address in data is to right so shift value right by offset
|
||||||
|
data_[data_cell] = ((finger ^ value) >> offset) |
|
||||||
|
(data_[data_cell] & ~(address_mask_ >> offset));
|
||||||
|
data_[data_cell + 1] = ((finger ^ value) << (cell_width_ - offset)) |
|
||||||
|
(data_[data_cell + 1] & (full_mask_ >> offset));
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
// debugging
|
||||||
|
void printFilter(const std::string & prefix = "", uint32_t truncate = 64) {
|
||||||
|
std::cout << prefix;
|
||||||
|
for (uint32_t i = 0; i < cells_ && i < truncate; ++i) {
|
||||||
|
for (int j = cell_width_ - 1; j >= 0; --j)
|
||||||
|
if (data_[i] & (1ull << j))
|
||||||
|
std::cout << 1;
|
||||||
|
else
|
||||||
|
std::cout << 0;
|
||||||
|
std::cout << "\n";
|
||||||
|
}
|
||||||
|
std::cout << std::endl;
|
||||||
|
}
|
||||||
|
// i/o
|
||||||
|
uint64_t getAddresses() {
|
||||||
|
return addresses_;
|
||||||
|
}
|
||||||
|
int getWidth() {
|
||||||
|
return width_;
|
||||||
|
}
|
||||||
|
int getCellWidth() {
|
||||||
|
return cell_width_;
|
||||||
|
}
|
||||||
|
uint32_t getCells() {
|
||||||
|
return cells_;
|
||||||
|
}
|
||||||
|
virtual bool save(FileHandler* out) {
|
||||||
|
CHECK(out != NULL);
|
||||||
|
CHECK(out->write((char*)&cells_, sizeof(cells_)));
|
||||||
|
CHECK(out->write((char*)&cell_width_, sizeof(cell_width_)));
|
||||||
|
CHECK(out->write((char*)&log_cell_width_, sizeof(log_cell_width_)));
|
||||||
|
CHECK(out->write((char*)&addresses_, sizeof(addresses_)));
|
||||||
|
CHECK(out->write((char*)&width_, sizeof(width_)));
|
||||||
|
CHECK(out->write((char*)&first_bit_, sizeof(first_bit_)));
|
||||||
|
CHECK(out->write((char*)&full_mask_, sizeof(full_mask_)));
|
||||||
|
CHECK(out->write((char*)&address_mask_, sizeof(address_mask_)));
|
||||||
|
//CHECK(out->write((char*)data_, cells_ * sizeof(T)));
|
||||||
|
const uint64_t jump = 524288032ul; //(uint64_t)pow(2, 29);
|
||||||
|
if((width_ == 1) || cells_ < jump)
|
||||||
|
CHECK(out->write((char*)data_, cells_ * sizeof(T)));
|
||||||
|
else {
|
||||||
|
uint64_t idx(0);
|
||||||
|
while(idx + jump < cells_) {
|
||||||
|
CHECK(out->write((char*)&data_[idx], jump * sizeof(T)));
|
||||||
|
idx += jump;
|
||||||
|
}
|
||||||
|
CHECK(out->write((char*)&data_[idx], (cells_ - idx) * sizeof(T)));
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
protected:
|
||||||
|
bool loadHeader(FileHandler* fin) {
|
||||||
|
CHECK(fin != NULL);
|
||||||
|
CHECK(fin->read((char*)&cells_, sizeof(cells_)));
|
||||||
|
CHECK(fin->read((char*)&cell_width_, sizeof(cell_width_)));
|
||||||
|
CHECK(cell_width_ == sizeof(T) << 3); // make sure correct underlying data type
|
||||||
|
CHECK(fin->read((char*)&log_cell_width_, sizeof(log_cell_width_)));
|
||||||
|
CHECK(fin->read((char*)&addresses_, sizeof(addresses_)));
|
||||||
|
CHECK(fin->read((char*)&width_, sizeof(width_)));
|
||||||
|
CHECK(fin->read((char*)&first_bit_, sizeof(first_bit_)));
|
||||||
|
CHECK(fin->read((char*)&full_mask_, sizeof(full_mask_)));
|
||||||
|
CHECK(fin->read((char*)&address_mask_, sizeof(address_mask_)));
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
bool loadData(FileHandler* fin) {
|
||||||
|
// instantiate underlying array
|
||||||
|
data_ = new T[cells_];
|
||||||
|
CHECK(data_ != NULL);
|
||||||
|
CHECK(fin->read((char*)data_, cells_ * sizeof(T)));
|
||||||
|
//CHECK(fin->read((char*)&data_[0], ceil(float(cells_) / 2.0) * sizeof(T)));
|
||||||
|
//CHECK(fin->read((char*)&data_[cells_ / 2], (cells_ / 2) * sizeof(T)));
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
uint64_t cells_; // number T making up 'data_'
|
||||||
|
int cell_width_; // bits per cell (i.e. sizeof(T) << 3)
|
||||||
|
int log_cell_width_; // log of bits used for >> division
|
||||||
|
uint64_t addresses_; // number of addresses in the filter
|
||||||
|
int width_; // width in bits of each address
|
||||||
|
int first_bit_; // position of first bit in initial byte
|
||||||
|
T full_mask_; // all 1s
|
||||||
|
T address_mask_; // 1s in those positions that are part of address
|
||||||
|
T* data_; // the raw data as bytes
|
||||||
|
};
|
||||||
|
|
||||||
|
// Extension with bit test/setter methods added
|
||||||
|
class BitFilter : public Filter<uint8_t>
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
BitFilter(uint64_t bits) : Filter<uint8_t>(bits, 1) {}
|
||||||
|
BitFilter(FileHandler* fin, bool loaddata = true)
|
||||||
|
: Filter<uint8_t>(fin, loaddata) {
|
||||||
|
if (loaddata)
|
||||||
|
CHECK(load(fin));
|
||||||
|
}
|
||||||
|
// TODO: overload operator[]
|
||||||
|
virtual bool testBit(uint64_t location) {
|
||||||
|
// test bit referenced by location
|
||||||
|
return data_[(location % addresses_) >> 3] & 1 << ((location % addresses_) % 8);
|
||||||
|
}
|
||||||
|
virtual bool setBit(uint64_t location) {
|
||||||
|
// set bit referenced by location
|
||||||
|
data_[(location % addresses_) >> 3] |= 1 << ((location % addresses_) % 8);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
virtual bool clearBit(uint64_t location) {
|
||||||
|
// set bit referenced by location
|
||||||
|
data_[(location % addresses_) >> 3] &= 0 << ((location % addresses_) % 8);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
bool save(FileHandler* fout) {
|
||||||
|
CHECK(Filter<uint8_t>::save(fout));
|
||||||
|
std::cerr << "Saved BitFilter. Rho = " << rho() << "." << std::endl;;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
float rho(uint64_t limit = 0) {
|
||||||
|
uint64_t ones = 0;
|
||||||
|
uint64_t range = limit > 0 ? std::min(limit,cells_) : cells_;
|
||||||
|
for (uint64_t i = 0; i < range; ++i)
|
||||||
|
for (int j = 0; j < 8; ++j)
|
||||||
|
if (data_[i] & (1 << j))
|
||||||
|
++ones;
|
||||||
|
return static_cast<float>((range << 3) - ones)/static_cast<float>(range << 3);
|
||||||
|
}
|
||||||
|
protected:
|
||||||
|
bool load(FileHandler* fin) {
|
||||||
|
std::cerr << "Loaded BitFilter. Rho = " << rho() << "." << std::endl;;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
/*
|
||||||
// ResizedBitFilter deals with resizing to save memory
|
// ResizedBitFilter deals with resizing to save memory
|
||||||
// whereas other filters should expect locations to be within range
|
// whereas other filters should expect locations to be within range
|
||||||
// this filter will need to resize (and possibly rehash) locations
|
// this filter will need to resize (and possibly rehash) locations
|
||||||
@ -380,9 +391,9 @@ namespace randlm {
|
|||||||
carry = incrementSubCell(data_bit, this->width_, &this->data_[data_cell]);
|
carry = incrementSubCell(data_bit, this->width_, &this->data_[data_cell]);
|
||||||
}
|
}
|
||||||
// last update must not have carried
|
// last update must not have carried
|
||||||
if (!carry)
|
if (!carry)
|
||||||
return true;
|
return true;
|
||||||
// wrapped round so check whether need to reset to max count
|
// wrapped round so check whether need to reset to max count
|
||||||
if (!wrap_around_)
|
if (!wrap_around_)
|
||||||
CHECK(this->write(address, this->address_mask_));
|
CHECK(this->write(address, this->address_mask_));
|
||||||
return false; // false to indicate that overflowed
|
return false; // false to indicate that overflowed
|
||||||
@ -397,7 +408,7 @@ namespace randlm {
|
|||||||
}
|
}
|
||||||
inline bool incrementSubCell(int bit, int len, T* cell) {
|
inline bool incrementSubCell(int bit, int len, T* cell) {
|
||||||
// increment counter consisting of bits [startbit, startbit + len - 1] rest stays unchanged
|
// increment counter consisting of bits [startbit, startbit + len - 1] rest stays unchanged
|
||||||
*cell = ((((*cell >> (this->cell_width_ - bit - len)) + 1)
|
*cell = ((((*cell >> (this->cell_width_ - bit - len)) + 1)
|
||||||
& (this->full_mask_ >> (this->cell_width_ - len))) << (this->cell_width_ - bit - len))
|
& (this->full_mask_ >> (this->cell_width_ - len))) << (this->cell_width_ - bit - len))
|
||||||
| (*cell & ~(((this->full_mask_ >> (this->cell_width_ - len)) << (this->cell_width_ - bit - len))));
|
| (*cell & ~(((this->full_mask_ >> (this->cell_width_ - len)) << (this->cell_width_ - bit - len))));
|
||||||
// indicate overflow as true
|
// indicate overflow as true
|
||||||
|
@ -10,58 +10,66 @@ using namespace Moses;
|
|||||||
typedef uint64_t P; // largest input range is 2^64
|
typedef uint64_t P; // largest input range is 2^64
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
class HashBase {
|
class HashBase
|
||||||
protected:
|
{
|
||||||
T m_; // range of hash output
|
protected:
|
||||||
count_t H_; // number of hash functions to instantiate
|
T m_; // range of hash output
|
||||||
virtual void initSeeds()=0;
|
count_t H_; // number of hash functions to instantiate
|
||||||
virtual void freeSeeds()=0;
|
virtual void initSeeds()=0;
|
||||||
public:
|
virtual void freeSeeds()=0;
|
||||||
HashBase(float m, count_t H=1):m_((T)m), H_(H) {
|
public:
|
||||||
//cerr << "range = (0..." << m_ << "]" << endl;
|
HashBase(float m, count_t H=1):m_((T)m), H_(H) {
|
||||||
}
|
//cerr << "range = (0..." << m_ << "]" << endl;
|
||||||
HashBase(FileHandler* fin) {
|
}
|
||||||
load(fin);
|
HashBase(FileHandler* fin) {
|
||||||
}
|
load(fin);
|
||||||
virtual ~HashBase(){}
|
}
|
||||||
virtual T hash(const char*s, count_t h)=0; // string hashing
|
virtual ~HashBase() {}
|
||||||
virtual T hash(const wordID_t* id, const int len, count_t h)=0; // vocab mapped hashing
|
virtual T hash(const char*s, count_t h)=0; // string hashing
|
||||||
count_t size() { return H_;}
|
virtual T hash(const wordID_t* id, const int len, count_t h)=0; // vocab mapped hashing
|
||||||
virtual void save(FileHandler* fout) {
|
count_t size() {
|
||||||
CHECK(fout != 0);
|
return H_;
|
||||||
fout->write((char*)&m_, sizeof(m_));
|
}
|
||||||
fout->write((char*)&H_, sizeof(H_));
|
virtual void save(FileHandler* fout) {
|
||||||
}
|
CHECK(fout != 0);
|
||||||
virtual void load(FileHandler* fin) {
|
fout->write((char*)&m_, sizeof(m_));
|
||||||
CHECK(fin != 0);
|
fout->write((char*)&H_, sizeof(H_));
|
||||||
fin->read((char*)&m_, sizeof(m_));
|
}
|
||||||
fin->read((char*)&H_, sizeof(H_));
|
virtual void load(FileHandler* fin) {
|
||||||
}
|
CHECK(fin != 0);
|
||||||
|
fin->read((char*)&m_, sizeof(m_));
|
||||||
|
fin->read((char*)&H_, sizeof(H_));
|
||||||
|
}
|
||||||
};
|
};
|
||||||
template <typename T>
|
template <typename T>
|
||||||
class UnivHash_linear: public HashBase<T> {
|
class UnivHash_linear: public HashBase<T>
|
||||||
public:
|
{
|
||||||
UnivHash_linear(float m, count_t H, P pr):
|
public:
|
||||||
HashBase<T>(m, H), pr_(pr) {
|
UnivHash_linear(float m, count_t H, P pr):
|
||||||
//CHECK(isPrime(pr_));
|
HashBase<T>(m, H), pr_(pr) {
|
||||||
initSeeds();
|
//CHECK(isPrime(pr_));
|
||||||
}
|
initSeeds();
|
||||||
UnivHash_linear(FileHandler* fin):
|
}
|
||||||
HashBase<T>(fin) {
|
UnivHash_linear(FileHandler* fin):
|
||||||
load(fin);
|
HashBase<T>(fin) {
|
||||||
}
|
load(fin);
|
||||||
~UnivHash_linear() {freeSeeds();}
|
}
|
||||||
T hash(const char* s, count_t h){return 0;} //not implemented
|
~UnivHash_linear() {
|
||||||
T hash(const wordID_t* id, const int len, count_t h);
|
freeSeeds();
|
||||||
T hash(const wordID_t id, const count_t pos,
|
}
|
||||||
const T prevValue, count_t h);
|
T hash(const char* s, count_t h) {
|
||||||
void save(FileHandler* fout);
|
return 0; //not implemented
|
||||||
void load(FileHandler* fin);
|
}
|
||||||
private:
|
T hash(const wordID_t* id, const int len, count_t h);
|
||||||
T** a_, **b_;
|
T hash(const wordID_t id, const count_t pos,
|
||||||
P pr_;
|
const T prevValue, count_t h);
|
||||||
void initSeeds();
|
void save(FileHandler* fout);
|
||||||
void freeSeeds();
|
void load(FileHandler* fin);
|
||||||
|
private:
|
||||||
|
T** a_, **b_;
|
||||||
|
P pr_;
|
||||||
|
void initSeeds();
|
||||||
|
void freeSeeds();
|
||||||
};
|
};
|
||||||
|
|
||||||
/* UnivHash_noPrimes:
|
/* UnivHash_noPrimes:
|
||||||
@ -71,74 +79,89 @@ class UnivHash_linear: public HashBase<T> {
|
|||||||
* # of hash function = 2^(l-1)
|
* # of hash function = 2^(l-1)
|
||||||
*/
|
*/
|
||||||
template <typename T>
|
template <typename T>
|
||||||
class UnivHash_noPrimes: public HashBase<T> {
|
class UnivHash_noPrimes: public HashBase<T>
|
||||||
public:
|
{
|
||||||
UnivHash_noPrimes(float k, float l):
|
public:
|
||||||
HashBase<T>(k, 100), d_(count_t((l-k))) {
|
UnivHash_noPrimes(float k, float l):
|
||||||
if(((int)l >> 3) == sizeof(P)) p_ = (P) pow(2,l) - 1;
|
HashBase<T>(k, 100), d_(count_t((l-k))) {
|
||||||
else p_ = (P) pow(2,l);
|
if(((int)l >> 3) == sizeof(P)) p_ = (P) pow(2,l) - 1;
|
||||||
initSeeds();
|
else p_ = (P) pow(2,l);
|
||||||
}
|
initSeeds();
|
||||||
UnivHash_noPrimes(FileHandler* fin):
|
}
|
||||||
HashBase<T>(fin) {
|
UnivHash_noPrimes(FileHandler* fin):
|
||||||
load(fin);
|
HashBase<T>(fin) {
|
||||||
}
|
load(fin);
|
||||||
~UnivHash_noPrimes() {freeSeeds();}
|
}
|
||||||
T hash(const char* s, count_t h);
|
~UnivHash_noPrimes() {
|
||||||
T hash(const wordID_t* id, const int len, count_t h);
|
freeSeeds();
|
||||||
T hash(const P x, count_t h);
|
}
|
||||||
void save(FileHandler* fout);
|
T hash(const char* s, count_t h);
|
||||||
void load(FileHandler* fin);
|
T hash(const wordID_t* id, const int len, count_t h);
|
||||||
private:
|
T hash(const P x, count_t h);
|
||||||
count_t d_; // l-k
|
void save(FileHandler* fout);
|
||||||
P p_, *a_; // real-valued input range, storage
|
void load(FileHandler* fin);
|
||||||
void initSeeds();
|
private:
|
||||||
void freeSeeds() {delete[] a_;}
|
count_t d_; // l-k
|
||||||
|
P p_, *a_; // real-valued input range, storage
|
||||||
|
void initSeeds();
|
||||||
|
void freeSeeds() {
|
||||||
|
delete[] a_;
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
class Hash_shiftAddXOR: public HashBase<T> {
|
class Hash_shiftAddXOR: public HashBase<T>
|
||||||
public:
|
{
|
||||||
Hash_shiftAddXOR(float m, count_t H=5): HashBase<T>(m,H),
|
public:
|
||||||
l_(5), r_(2) {
|
Hash_shiftAddXOR(float m, count_t H=5): HashBase<T>(m,H),
|
||||||
initSeeds();
|
l_(5), r_(2) {
|
||||||
}
|
initSeeds();
|
||||||
~Hash_shiftAddXOR() {freeSeeds();}
|
}
|
||||||
T hash(const char* s, count_t h);
|
~Hash_shiftAddXOR() {
|
||||||
T hash(const wordID_t* id, const int len, count_t h) {} // empty
|
freeSeeds();
|
||||||
private:
|
}
|
||||||
T* v_; // random seed storage
|
T hash(const char* s, count_t h);
|
||||||
const unsigned short l_, r_; // left-shift bits, right-shift bits
|
T hash(const wordID_t* id, const int len, count_t h) {} // empty
|
||||||
void initSeeds();
|
private:
|
||||||
void freeSeeds() {delete[] v_;}
|
T* v_; // random seed storage
|
||||||
|
const unsigned short l_, r_; // left-shift bits, right-shift bits
|
||||||
|
void initSeeds();
|
||||||
|
void freeSeeds() {
|
||||||
|
delete[] v_;
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
class UnivHash_tableXOR: public HashBase<T> {
|
class UnivHash_tableXOR: public HashBase<T>
|
||||||
public:
|
{
|
||||||
UnivHash_tableXOR(float m, count_t H=5): HashBase<T>(m, H),
|
public:
|
||||||
table_(NULL), tblLen_(255*MAX_STR_LEN) {
|
UnivHash_tableXOR(float m, count_t H=5): HashBase<T>(m, H),
|
||||||
initSeeds();
|
table_(NULL), tblLen_(255*MAX_STR_LEN) {
|
||||||
}
|
initSeeds();
|
||||||
~UnivHash_tableXOR() {freeSeeds();}
|
}
|
||||||
T hash(const char* s, count_t h);
|
~UnivHash_tableXOR() {
|
||||||
T hash(const wordID_t* id, const int len, count_t h) {}
|
freeSeeds();
|
||||||
private:
|
}
|
||||||
T** table_; // storage for random numbers
|
T hash(const char* s, count_t h);
|
||||||
count_t tblLen_; // length of table
|
T hash(const wordID_t* id, const int len, count_t h) {}
|
||||||
void initSeeds();
|
private:
|
||||||
void freeSeeds();
|
T** table_; // storage for random numbers
|
||||||
|
count_t tblLen_; // length of table
|
||||||
|
void initSeeds();
|
||||||
|
void freeSeeds();
|
||||||
};
|
};
|
||||||
|
|
||||||
// ShiftAddXor
|
// ShiftAddXor
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void Hash_shiftAddXOR<T>::initSeeds() {
|
void Hash_shiftAddXOR<T>::initSeeds()
|
||||||
|
{
|
||||||
v_ = new T[this->H_];
|
v_ = new T[this->H_];
|
||||||
for(count_t i=0; i < this->H_; i++)
|
for(count_t i=0; i < this->H_; i++)
|
||||||
v_[i] = Utils::rand<T>() + 1;
|
v_[i] = Utils::rand<T>() + 1;
|
||||||
}
|
}
|
||||||
template <typename T>
|
template <typename T>
|
||||||
T Hash_shiftAddXOR<T>::hash(const char* s, count_t h=0) {
|
T Hash_shiftAddXOR<T>::hash(const char* s, count_t h=0)
|
||||||
|
{
|
||||||
T value = v_[h];
|
T value = v_[h];
|
||||||
int pos(0);
|
int pos(0);
|
||||||
unsigned char c;
|
unsigned char c;
|
||||||
@ -150,40 +173,44 @@ T Hash_shiftAddXOR<T>::hash(const char* s, count_t h=0) {
|
|||||||
|
|
||||||
// UnivHash_tableXOR
|
// UnivHash_tableXOR
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void UnivHash_tableXOR<T>::initSeeds() {
|
void UnivHash_tableXOR<T>::initSeeds()
|
||||||
|
{
|
||||||
// delete any values in table
|
// delete any values in table
|
||||||
if(table_) freeSeeds();
|
if(table_) freeSeeds();
|
||||||
// instance of new table
|
// instance of new table
|
||||||
table_ = new T* [this->H_];
|
table_ = new T* [this->H_];
|
||||||
// fill with random values
|
// fill with random values
|
||||||
for(count_t j=0; j < this->H_; j++) {
|
for(count_t j=0; j < this->H_; j++) {
|
||||||
table_[j] = new T[tblLen_];
|
table_[j] = new T[tblLen_];
|
||||||
for(count_t i=0; i < tblLen_; i++) {
|
for(count_t i=0; i < tblLen_; i++) {
|
||||||
table_[j][i] = Utils::rand<T>(this->m_-1);
|
table_[j][i] = Utils::rand<T>(this->m_-1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void UnivHash_tableXOR<T>::freeSeeds() {
|
void UnivHash_tableXOR<T>::freeSeeds()
|
||||||
|
{
|
||||||
for(count_t j = 0; j < this->H_; j++)
|
for(count_t j = 0; j < this->H_; j++)
|
||||||
delete[] table_[j];
|
delete[] table_[j];
|
||||||
delete[] table_;
|
delete[] table_;
|
||||||
table_ = NULL;
|
table_ = NULL;
|
||||||
}
|
}
|
||||||
template <typename T>
|
template <typename T>
|
||||||
T UnivHash_tableXOR<T>::hash(const char* s, count_t h = 0) {
|
T UnivHash_tableXOR<T>::hash(const char* s, count_t h = 0)
|
||||||
|
{
|
||||||
T value = 0;
|
T value = 0;
|
||||||
count_t pos = 0, idx = 0;
|
count_t pos = 0, idx = 0;
|
||||||
unsigned char c;
|
unsigned char c;
|
||||||
while((c = *s++) && (++pos < MAX_STR_LEN))
|
while((c = *s++) && (++pos < MAX_STR_LEN))
|
||||||
value ^= table_[h][idx += c];
|
value ^= table_[h][idx += c];
|
||||||
CHECK(value < this->m_);
|
CHECK(value < this->m_);
|
||||||
return value;
|
return value;
|
||||||
}
|
}
|
||||||
|
|
||||||
// UnivHash_noPrimes
|
// UnivHash_noPrimes
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void UnivHash_noPrimes<T>::initSeeds() {
|
void UnivHash_noPrimes<T>::initSeeds()
|
||||||
|
{
|
||||||
a_ = new P[this->H_];
|
a_ = new P[this->H_];
|
||||||
for(T i=0; i < this->H_; i++) {
|
for(T i=0; i < this->H_; i++) {
|
||||||
a_[i] = Utils::rand<P>();
|
a_[i] = Utils::rand<P>();
|
||||||
@ -191,14 +218,16 @@ void UnivHash_noPrimes<T>::initSeeds() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
template <typename T>
|
template <typename T>
|
||||||
T UnivHash_noPrimes<T>::hash(const P x, count_t h=0) {
|
T UnivHash_noPrimes<T>::hash(const P x, count_t h=0)
|
||||||
|
{
|
||||||
// h_a(x) = (ax mod 2^l) div 2^(l-k)
|
// h_a(x) = (ax mod 2^l) div 2^(l-k)
|
||||||
T value = ((a_[h] * x) % p_) >> d_;
|
T value = ((a_[h] * x) % p_) >> d_;
|
||||||
return value % this->m_;
|
return value % this->m_;
|
||||||
}
|
}
|
||||||
template <typename T>
|
template <typename T>
|
||||||
T UnivHash_noPrimes<T>::hash(const wordID_t* id, const int len,
|
T UnivHash_noPrimes<T>::hash(const wordID_t* id, const int len,
|
||||||
count_t h=0) {
|
count_t h=0)
|
||||||
|
{
|
||||||
T value = 0;
|
T value = 0;
|
||||||
int pos(0);
|
int pos(0);
|
||||||
while(pos < len) {
|
while(pos < len) {
|
||||||
@ -208,39 +237,42 @@ T UnivHash_noPrimes<T>::hash(const wordID_t* id, const int len,
|
|||||||
return value % this->m_;
|
return value % this->m_;
|
||||||
}
|
}
|
||||||
template <typename T>
|
template <typename T>
|
||||||
T UnivHash_noPrimes<T>::hash(const char* s, count_t h=0) {
|
T UnivHash_noPrimes<T>::hash(const char* s, count_t h=0)
|
||||||
|
{
|
||||||
T value = 0;
|
T value = 0;
|
||||||
int pos(0);
|
int pos(0);
|
||||||
unsigned char c;
|
unsigned char c;
|
||||||
while((c = *s++) && (++pos < MAX_STR_LEN)) {
|
while((c = *s++) && (++pos < MAX_STR_LEN)) {
|
||||||
value ^= hash((P)c, h);
|
value ^= hash((P)c, h);
|
||||||
}
|
}
|
||||||
return value % this->m_;
|
return value % this->m_;
|
||||||
}
|
}
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void UnivHash_noPrimes<T>::save(FileHandler* fout) {
|
void UnivHash_noPrimes<T>::save(FileHandler* fout)
|
||||||
|
{
|
||||||
HashBase<T>::save(fout);
|
HashBase<T>::save(fout);
|
||||||
fout->write((char*)&p_, sizeof(p_));
|
fout->write((char*)&p_, sizeof(p_));
|
||||||
fout->write((char*)&d_, sizeof(d_));
|
fout->write((char*)&d_, sizeof(d_));
|
||||||
for(T i=0; i < this->H_; i++) {
|
for(T i=0; i < this->H_; i++) {
|
||||||
fout->write((char*)&a_[i], sizeof(a_[i]));
|
fout->write((char*)&a_[i], sizeof(a_[i]));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void UnivHash_noPrimes<T>::load(FileHandler* fin) {
|
void UnivHash_noPrimes<T>::load(FileHandler* fin)
|
||||||
|
{
|
||||||
a_ = new P[this->H_];
|
a_ = new P[this->H_];
|
||||||
// HashBase<T>::load(fin) already done in constructor
|
// HashBase<T>::load(fin) already done in constructor
|
||||||
fin->read((char*)&p_, sizeof(p_));
|
fin->read((char*)&p_, sizeof(p_));
|
||||||
fin->read((char*)&d_, sizeof(d_));
|
fin->read((char*)&d_, sizeof(d_));
|
||||||
for(T i=0; i < this->H_; i++)
|
for(T i=0; i < this->H_; i++) {
|
||||||
{
|
|
||||||
fin->read((char*)&a_[i], sizeof(a_[i]));
|
fin->read((char*)&a_[i], sizeof(a_[i]));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//UnivHash_linear
|
//UnivHash_linear
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void UnivHash_linear<T>::initSeeds() {
|
void UnivHash_linear<T>::initSeeds()
|
||||||
|
{
|
||||||
a_ = new T*[this->H_];
|
a_ = new T*[this->H_];
|
||||||
b_ = new T*[this->H_];
|
b_ = new T*[this->H_];
|
||||||
for(count_t i=0; i < this->H_; i++) {
|
for(count_t i=0; i < this->H_; i++) {
|
||||||
@ -253,7 +285,8 @@ void UnivHash_linear<T>::initSeeds() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void UnivHash_linear<T>::freeSeeds() {
|
void UnivHash_linear<T>::freeSeeds()
|
||||||
|
{
|
||||||
for(count_t i=0; i < this->H_; i++) {
|
for(count_t i=0; i < this->H_; i++) {
|
||||||
delete[] a_[i];
|
delete[] a_[i];
|
||||||
delete[] b_[i];
|
delete[] b_[i];
|
||||||
@ -263,8 +296,9 @@ void UnivHash_linear<T>::freeSeeds() {
|
|||||||
a_ = b_ = NULL;
|
a_ = b_ = NULL;
|
||||||
}
|
}
|
||||||
template <typename T>
|
template <typename T>
|
||||||
inline T UnivHash_linear<T>::hash(const wordID_t* id, const int len,
|
inline T UnivHash_linear<T>::hash(const wordID_t* id, const int len,
|
||||||
count_t h=0) {
|
count_t h=0)
|
||||||
|
{
|
||||||
CHECK(h < this->H_);
|
CHECK(h < this->H_);
|
||||||
T value = 0;
|
T value = 0;
|
||||||
int pos(0);
|
int pos(0);
|
||||||
@ -276,19 +310,21 @@ inline T UnivHash_linear<T>::hash(const wordID_t* id, const int len,
|
|||||||
}
|
}
|
||||||
template <typename T>
|
template <typename T>
|
||||||
inline T UnivHash_linear<T>::hash(const wordID_t id, const count_t pos,
|
inline T UnivHash_linear<T>::hash(const wordID_t id, const count_t pos,
|
||||||
const T prevValue, count_t h=0) {
|
const T prevValue, count_t h=0)
|
||||||
|
{
|
||||||
CHECK(h < this->H_);
|
CHECK(h < this->H_);
|
||||||
T value = prevValue + ((a_[h][pos] * id) + b_[h][pos]); // % pr_;
|
T value = prevValue + ((a_[h][pos] * id) + b_[h][pos]); // % pr_;
|
||||||
return value % this->m_;
|
return value % this->m_;
|
||||||
}
|
}
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void UnivHash_linear<T>::save(FileHandler* fout) {
|
void UnivHash_linear<T>::save(FileHandler* fout)
|
||||||
|
{
|
||||||
// int bytes = sizeof(a_[0][0]);
|
// int bytes = sizeof(a_[0][0]);
|
||||||
HashBase<T>::save(fout);
|
HashBase<T>::save(fout);
|
||||||
fout->write((char*)&pr_, sizeof(pr_));
|
fout->write((char*)&pr_, sizeof(pr_));
|
||||||
for(count_t i=0; i < this->H_; i++) {
|
for(count_t i=0; i < this->H_; i++) {
|
||||||
for(count_t j=0; j < MAX_NGRAM_ORDER; j++) {
|
for(count_t j=0; j < MAX_NGRAM_ORDER; j++) {
|
||||||
fout->write((char*)&a_[i][j], sizeof(a_[i][j]));
|
fout->write((char*)&a_[i][j], sizeof(a_[i][j]));
|
||||||
fout->write((char*)&b_[i][j], sizeof(b_[i][j]));
|
fout->write((char*)&b_[i][j], sizeof(b_[i][j]));
|
||||||
//cout << "a[" << i << "][" << j << "]=" << a_[i][j] << endl;
|
//cout << "a[" << i << "][" << j << "]=" << a_[i][j] << endl;
|
||||||
//cout << "b[" << i << "][" << j << "]=" << b_[i][j] << endl;
|
//cout << "b[" << i << "][" << j << "]=" << b_[i][j] << endl;
|
||||||
@ -296,7 +332,8 @@ void UnivHash_linear<T>::save(FileHandler* fout) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void UnivHash_linear<T>::load(FileHandler* fin) {
|
void UnivHash_linear<T>::load(FileHandler* fin)
|
||||||
|
{
|
||||||
// HashBase<T>::load(fin) already done in constructor
|
// HashBase<T>::load(fin) already done in constructor
|
||||||
fin->read((char*)&pr_, sizeof(pr_));
|
fin->read((char*)&pr_, sizeof(pr_));
|
||||||
a_ = new T*[this->H_];
|
a_ = new T*[this->H_];
|
||||||
@ -305,8 +342,8 @@ void UnivHash_linear<T>::load(FileHandler* fin) {
|
|||||||
a_[i] = new T[MAX_NGRAM_ORDER];
|
a_[i] = new T[MAX_NGRAM_ORDER];
|
||||||
b_[i] = new T[MAX_NGRAM_ORDER];
|
b_[i] = new T[MAX_NGRAM_ORDER];
|
||||||
for(count_t j=0; j < MAX_NGRAM_ORDER; j++) {
|
for(count_t j=0; j < MAX_NGRAM_ORDER; j++) {
|
||||||
fin->read((char*)&a_[i][j], sizeof(a_[i][j]));
|
fin->read((char*)&a_[i][j], sizeof(a_[i][j]));
|
||||||
fin->read((char*)&b_[i][j], sizeof(b_[i][j]));
|
fin->read((char*)&b_[i][j], sizeof(b_[i][j]));
|
||||||
//cout << "a[" << i << "][" << j << "]=" << a_[i][j] << endl;
|
//cout << "a[" << i << "][" << j << "]=" << a_[i][j] << endl;
|
||||||
//cout << "b[" << i << "][" << j << "]=" << b_[i][j] << endl;
|
//cout << "b[" << i << "][" << j << "]=" << b_[i][j] << endl;
|
||||||
}
|
}
|
||||||
|
@ -16,27 +16,28 @@ using randlm::Cache;
|
|||||||
const bool strict_checks_ = false;
|
const bool strict_checks_ = false;
|
||||||
|
|
||||||
template<typename T>
|
template<typename T>
|
||||||
class OnlineRLM: public PerfectHash<T> {
|
class OnlineRLM: public PerfectHash<T>
|
||||||
|
{
|
||||||
public:
|
public:
|
||||||
OnlineRLM(uint16_t MBs, int width, int bucketRange, count_t order,
|
OnlineRLM(uint16_t MBs, int width, int bucketRange, count_t order,
|
||||||
Vocab* v, float qBase = 8): PerfectHash<T>(MBs, width, bucketRange, qBase),
|
Vocab* v, float qBase = 8): PerfectHash<T>(MBs, width, bucketRange, qBase),
|
||||||
vocab_(v), bAdapting_(false), order_(order), corpusSize_(0), alpha_(0) {
|
vocab_(v), bAdapting_(false), order_(order), corpusSize_(0), alpha_(0) {
|
||||||
CHECK(vocab_ != 0);
|
CHECK(vocab_ != 0);
|
||||||
//instantiate quantizer class here
|
//instantiate quantizer class here
|
||||||
cache_ = new Cache<float>(8888.8888, 9999.9999); // unknown_value, null_value
|
cache_ = new Cache<float>(8888.8888, 9999.9999); // unknown_value, null_value
|
||||||
alpha_ = new float[order_ + 1];
|
alpha_ = new float[order_ + 1];
|
||||||
for(count_t i = 0; i <= order_; ++i)
|
for(count_t i = 0; i <= order_; ++i)
|
||||||
alpha_[i] = i * log10(0.4);
|
alpha_[i] = i * log10(0.4);
|
||||||
cerr << "Initialzing auxillary bit filters...\n";
|
cerr << "Initialzing auxillary bit filters...\n";
|
||||||
bPrefix_ = new BitFilter(this->cells_);
|
bPrefix_ = new BitFilter(this->cells_);
|
||||||
bHit_ = new BitFilter(this->cells_);
|
bHit_ = new BitFilter(this->cells_);
|
||||||
}
|
}
|
||||||
OnlineRLM(FileHandler* fin, count_t order):
|
OnlineRLM(FileHandler* fin, count_t order):
|
||||||
PerfectHash<T>(fin), bAdapting_(true), order_(order), corpusSize_(0) {
|
PerfectHash<T>(fin), bAdapting_(true), order_(order), corpusSize_(0) {
|
||||||
load(fin);
|
load(fin);
|
||||||
cache_ = new Cache<float>(8888.8888, 9999.9999); // unknown_value, null_value
|
cache_ = new Cache<float>(8888.8888, 9999.9999); // unknown_value, null_value
|
||||||
alpha_ = new float[order_ + 1];
|
alpha_ = new float[order_ + 1];
|
||||||
for(count_t i = 0; i <= order_; ++i)
|
for(count_t i = 0; i <= order_; ++i)
|
||||||
alpha_[i] = i * log10(0.4);
|
alpha_[i] = i * log10(0.4);
|
||||||
}
|
}
|
||||||
~OnlineRLM() {
|
~OnlineRLM() {
|
||||||
@ -52,14 +53,18 @@ public:
|
|||||||
bool insert(const std::vector<string>& ngram, const int value);
|
bool insert(const std::vector<string>& ngram, const int value);
|
||||||
bool update(const std::vector<string>& ngram, const int value);
|
bool update(const std::vector<string>& ngram, const int value);
|
||||||
int query(const wordID_t* IDs, const int len);
|
int query(const wordID_t* IDs, const int len);
|
||||||
int sbsqQuery(const std::vector<string>& ngram, int* len,
|
int sbsqQuery(const std::vector<string>& ngram, int* len,
|
||||||
bool bStrict = false);
|
bool bStrict = false);
|
||||||
int sbsqQuery(const wordID_t* IDs, const int len, int* codes,
|
int sbsqQuery(const wordID_t* IDs, const int len, int* codes,
|
||||||
bool bStrict = false);
|
bool bStrict = false);
|
||||||
void remove(const std::vector<string>& ngram);
|
void remove(const std::vector<string>& ngram);
|
||||||
count_t heurDelete(count_t num2del, count_t order = 5);
|
count_t heurDelete(count_t num2del, count_t order = 5);
|
||||||
uint64_t corpusSize() {return corpusSize_;}
|
uint64_t corpusSize() {
|
||||||
void corpusSize(uint64_t c) {corpusSize_ = c;}
|
return corpusSize_;
|
||||||
|
}
|
||||||
|
void corpusSize(uint64_t c) {
|
||||||
|
corpusSize_ = c;
|
||||||
|
}
|
||||||
void clearCache() {
|
void clearCache() {
|
||||||
if(cache_) cache_->clear();
|
if(cache_) cache_->clear();
|
||||||
}
|
}
|
||||||
@ -77,7 +82,7 @@ protected:
|
|||||||
void markQueried(hpdEntry_t& value);
|
void markQueried(hpdEntry_t& value);
|
||||||
bool markPrefix(const wordID_t* IDs, const int len, bool bSet);
|
bool markPrefix(const wordID_t* IDs, const int len, bool bSet);
|
||||||
private:
|
private:
|
||||||
const void* getContext(const wordID_t* ngram, int len);
|
const void* getContext(const wordID_t* ngram, int len);
|
||||||
const bool bAdapting_; // used to signal adaptation of model
|
const bool bAdapting_; // used to signal adaptation of model
|
||||||
const count_t order_; // LM order
|
const count_t order_; // LM order
|
||||||
uint64_t corpusSize_; // total training corpus size
|
uint64_t corpusSize_; // total training corpus size
|
||||||
@ -87,46 +92,48 @@ private:
|
|||||||
BitFilter* bHit_;
|
BitFilter* bHit_;
|
||||||
};
|
};
|
||||||
template<typename T>
|
template<typename T>
|
||||||
bool OnlineRLM<T>::insert(const std::vector<string>& ngram, const int value) {
|
bool OnlineRLM<T>::insert(const std::vector<string>& ngram, const int value)
|
||||||
|
{
|
||||||
int len = ngram.size();
|
int len = ngram.size();
|
||||||
wordID_t wrdIDs[len];
|
wordID_t wrdIDs[len];
|
||||||
uint64_t index(this->cells_ + 1);
|
uint64_t index(this->cells_ + 1);
|
||||||
for(int i = 0; i < len; ++i)
|
for(int i = 0; i < len; ++i)
|
||||||
wrdIDs[i] = vocab_->GetWordID(ngram[i]);
|
wrdIDs[i] = vocab_->GetWordID(ngram[i]);
|
||||||
index = PerfectHash<T>::insert(wrdIDs, len, value);
|
index = PerfectHash<T>::insert(wrdIDs, len, value);
|
||||||
if(value > 1 && len < order_)
|
if(value > 1 && len < order_)
|
||||||
markPrefix(wrdIDs, ngram.size(), true); // mark context
|
markPrefix(wrdIDs, ngram.size(), true); // mark context
|
||||||
// keep track of total items from training data minus "<s>"
|
// keep track of total items from training data minus "<s>"
|
||||||
if(ngram.size() == 1 && (!bAdapting_)) // hack to not change corpusSize when adapting
|
if(ngram.size() == 1 && (!bAdapting_)) // hack to not change corpusSize when adapting
|
||||||
corpusSize_ += (wrdIDs[0] != vocab_->GetBOSWordID()) ? value : 0;
|
corpusSize_ += (wrdIDs[0] != vocab_->GetBOSWordID()) ? value : 0;
|
||||||
if(bAdapting_ && (index < this->cells_)) // mark to keep while adapting
|
if(bAdapting_ && (index < this->cells_)) // mark to keep while adapting
|
||||||
markQueried(index);
|
markQueried(index);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
template<typename T>
|
template<typename T>
|
||||||
bool OnlineRLM<T>::update(const std::vector<string>& ngram, const int value) {
|
bool OnlineRLM<T>::update(const std::vector<string>& ngram, const int value)
|
||||||
|
{
|
||||||
int len = ngram.size();
|
int len = ngram.size();
|
||||||
wordID_t wrdIDs[len];
|
wordID_t wrdIDs[len];
|
||||||
uint64_t index(this->cells_ + 1);
|
uint64_t index(this->cells_ + 1);
|
||||||
hpdEntry_t hpdItr;
|
hpdEntry_t hpdItr;
|
||||||
vocab_->MakeOpen();
|
vocab_->MakeOpen();
|
||||||
for(int i = 0; i < len; ++i)
|
for(int i = 0; i < len; ++i)
|
||||||
wrdIDs[i] = vocab_->GetWordID(ngram[i]);
|
wrdIDs[i] = vocab_->GetWordID(ngram[i]);
|
||||||
// if updating, minimize false positives by pre-checking if context already in model
|
// if updating, minimize false positives by pre-checking if context already in model
|
||||||
bool bIncluded(true);
|
bool bIncluded(true);
|
||||||
if(value > 1 && len < (int)order_)
|
if(value > 1 && len < (int)order_)
|
||||||
bIncluded = markPrefix(wrdIDs, ngram.size(), true); // mark context
|
bIncluded = markPrefix(wrdIDs, ngram.size(), true); // mark context
|
||||||
if(bIncluded) { // if context found
|
if(bIncluded) { // if context found
|
||||||
bIncluded = PerfectHash<T>::update2(wrdIDs, len, value, hpdItr, index);
|
bIncluded = PerfectHash<T>::update2(wrdIDs, len, value, hpdItr, index);
|
||||||
if(index < this->cells_) {
|
if(index < this->cells_) {
|
||||||
markQueried(index);
|
markQueried(index);
|
||||||
}
|
} else if(hpdItr != this->dict_.end()) markQueried(hpdItr);
|
||||||
else if(hpdItr != this->dict_.end()) markQueried(hpdItr);
|
|
||||||
}
|
}
|
||||||
return bIncluded;
|
return bIncluded;
|
||||||
}
|
}
|
||||||
template<typename T>
|
template<typename T>
|
||||||
int OnlineRLM<T>::query(const wordID_t* IDs, int len) {
|
int OnlineRLM<T>::query(const wordID_t* IDs, int len)
|
||||||
|
{
|
||||||
uint64_t filterIdx = 0;
|
uint64_t filterIdx = 0;
|
||||||
hpdEntry_t hpdItr;
|
hpdEntry_t hpdItr;
|
||||||
int value(0);
|
int value(0);
|
||||||
@ -135,8 +142,7 @@ int OnlineRLM<T>::query(const wordID_t* IDs, int len) {
|
|||||||
if(hpdItr != this->dict_.end()) {
|
if(hpdItr != this->dict_.end()) {
|
||||||
//markQueried(hpdItr); // mark this event as "hit"
|
//markQueried(hpdItr); // mark this event as "hit"
|
||||||
value -= ((value & this->hitMask_) != 0) ? this->hitMask_ : 0; // check for previous hit marks
|
value -= ((value & this->hitMask_) != 0) ? this->hitMask_ : 0; // check for previous hit marks
|
||||||
}
|
} else {
|
||||||
else {
|
|
||||||
CHECK(filterIdx < this->cells_);
|
CHECK(filterIdx < this->cells_);
|
||||||
//markQueried(filterIdx);
|
//markQueried(filterIdx);
|
||||||
}
|
}
|
||||||
@ -144,15 +150,16 @@ int OnlineRLM<T>::query(const wordID_t* IDs, int len) {
|
|||||||
return value > 0 ? value : 0;
|
return value > 0 ? value : 0;
|
||||||
}
|
}
|
||||||
template<typename T>
|
template<typename T>
|
||||||
bool OnlineRLM<T>::markPrefix(const wordID_t* IDs, const int len, bool bSet) {
|
bool OnlineRLM<T>::markPrefix(const wordID_t* IDs, const int len, bool bSet)
|
||||||
if(len <= 1) return true; // only do this for for ngrams with context
|
{
|
||||||
static Cache<int> pfCache(-1, -1); // local prefix cache
|
if(len <= 1) return true; // only do this for for ngrams with context
|
||||||
|
static Cache<int> pfCache(-1, -1); // local prefix cache
|
||||||
int code(0);
|
int code(0);
|
||||||
if(!pfCache.checkCacheNgram(IDs, len - 1, &code, NULL)) {
|
if(!pfCache.checkCacheNgram(IDs, len - 1, &code, NULL)) {
|
||||||
hpdEntry_t hpdItr;
|
hpdEntry_t hpdItr;
|
||||||
uint64_t filterIndex(0);
|
uint64_t filterIndex(0);
|
||||||
code = PerfectHash<T>::query(IDs, len - 1, hpdItr, filterIndex); // hash IDs[0..len-1]
|
code = PerfectHash<T>::query(IDs, len - 1, hpdItr, filterIndex); // hash IDs[0..len-1]
|
||||||
if(code == -1) { // encountered false positive in pipeline
|
if(code == -1) { // encountered false positive in pipeline
|
||||||
cerr << "WARNING: markPrefix(). The O-RLM is *not* well-formed.\n";
|
cerr << "WARNING: markPrefix(). The O-RLM is *not* well-formed.\n";
|
||||||
// add all prefixes or return false;
|
// add all prefixes or return false;
|
||||||
return false;
|
return false;
|
||||||
@ -161,10 +168,9 @@ bool OnlineRLM<T>::markPrefix(const wordID_t* IDs, const int len, bool bSet) {
|
|||||||
CHECK(hpdItr == this->dict_.end());
|
CHECK(hpdItr == this->dict_.end());
|
||||||
if(bSet) bPrefix_->setBit(filterIndex); // mark index
|
if(bSet) bPrefix_->setBit(filterIndex); // mark index
|
||||||
else bPrefix_->clearBit(filterIndex); // unset index
|
else bPrefix_->clearBit(filterIndex); // unset index
|
||||||
}
|
} else {
|
||||||
else {
|
|
||||||
CHECK(filterIndex == this->cells_ + 1);
|
CHECK(filterIndex == this->cells_ + 1);
|
||||||
//how to handle hpd prefixes?
|
//how to handle hpd prefixes?
|
||||||
}
|
}
|
||||||
if(pfCache.nodes() > 10000) pfCache.clear();
|
if(pfCache.nodes() > 10000) pfCache.clear();
|
||||||
pfCache.setCacheNgram(IDs, len - 1, code, NULL);
|
pfCache.setCacheNgram(IDs, len - 1, code, NULL);
|
||||||
@ -172,36 +178,40 @@ bool OnlineRLM<T>::markPrefix(const wordID_t* IDs, const int len, bool bSet) {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
template<typename T>
|
template<typename T>
|
||||||
void OnlineRLM<T>::markQueried(const uint64_t& index) {
|
void OnlineRLM<T>::markQueried(const uint64_t& index)
|
||||||
|
{
|
||||||
bHit_->setBit(index);
|
bHit_->setBit(index);
|
||||||
//cerr << "filter[" << index << "] = " << this->filter_->read(index) << endl;
|
//cerr << "filter[" << index << "] = " << this->filter_->read(index) << endl;
|
||||||
}
|
}
|
||||||
template<typename T>
|
template<typename T>
|
||||||
void OnlineRLM<T>::markQueried(hpdEntry_t& value) {
|
void OnlineRLM<T>::markQueried(hpdEntry_t& value)
|
||||||
// set high bit of counter to indicate "hit" status
|
{
|
||||||
|
// set high bit of counter to indicate "hit" status
|
||||||
value->second |= this->hitMask_;
|
value->second |= this->hitMask_;
|
||||||
}
|
}
|
||||||
template<typename T>
|
template<typename T>
|
||||||
void OnlineRLM<T>::remove(const std::vector<string>& ngram) {
|
void OnlineRLM<T>::remove(const std::vector<string>& ngram)
|
||||||
|
{
|
||||||
wordID_t IDs[ngram.size()];
|
wordID_t IDs[ngram.size()];
|
||||||
for(count_t i = 0; i < ngram.size(); ++i)
|
for(count_t i = 0; i < ngram.size(); ++i)
|
||||||
IDs[i] = vocab_->GetWordID(ngram[i]);
|
IDs[i] = vocab_->GetWordID(ngram[i]);
|
||||||
PerfectHash<T>::remove(IDs, ngram.size());
|
PerfectHash<T>::remove(IDs, ngram.size());
|
||||||
}
|
}
|
||||||
template<typename T>
|
template<typename T>
|
||||||
count_t OnlineRLM<T>::heurDelete(count_t num2del, count_t order) {
|
count_t OnlineRLM<T>::heurDelete(count_t num2del, count_t order)
|
||||||
|
{
|
||||||
count_t deleted = 0;
|
count_t deleted = 0;
|
||||||
cout << "Deleting " << num2del << " of order "<< order << endl;
|
cout << "Deleting " << num2del << " of order "<< order << endl;
|
||||||
// delete from filter first
|
// delete from filter first
|
||||||
int full = *std::max_element(this->idxTracker_, this->idxTracker_
|
int full = *std::max_element(this->idxTracker_, this->idxTracker_
|
||||||
+ this->totBuckets_);
|
+ this->totBuckets_);
|
||||||
for(; full > 0; --full) // delete from fullest buckets first
|
for(; full > 0; --full) // delete from fullest buckets first
|
||||||
for(int bk = 0; bk < this->totBuckets_; ++bk) {
|
for(int bk = 0; bk < this->totBuckets_; ++bk) {
|
||||||
if(deleted >= num2del) break;
|
if(deleted >= num2del) break;
|
||||||
if(this->idxTracker_[bk] == full) { // if full
|
if(this->idxTracker_[bk] == full) { // if full
|
||||||
uint64_t first = bk * this->bucketRange_,
|
uint64_t first = bk * this->bucketRange_,
|
||||||
last = first + this->bucketRange_;
|
last = first + this->bucketRange_;
|
||||||
for(uint64_t row = first; row < last; ++row) { // check each row
|
for(uint64_t row = first; row < last; ++row) { // check each row
|
||||||
if(!(bHit_->testBit(row) || bPrefix_->testBit(row) )) {
|
if(!(bHit_->testBit(row) || bPrefix_->testBit(row) )) {
|
||||||
if(this->filter_->read(row) != 0) {
|
if(this->filter_->read(row) != 0) {
|
||||||
PerfectHash<T>::remove(row); // remove from filter
|
PerfectHash<T>::remove(row); // remove from filter
|
||||||
@ -220,15 +230,17 @@ count_t OnlineRLM<T>::heurDelete(count_t num2del, count_t order) {
|
|||||||
}
|
}
|
||||||
template<typename T>
|
template<typename T>
|
||||||
int OnlineRLM<T>::sbsqQuery(const std::vector<string>& ngram, int* codes,
|
int OnlineRLM<T>::sbsqQuery(const std::vector<string>& ngram, int* codes,
|
||||||
bool bStrict) {
|
bool bStrict)
|
||||||
|
{
|
||||||
wordID_t IDs[ngram.size()];
|
wordID_t IDs[ngram.size()];
|
||||||
for(count_t i = 0; i < ngram.size(); ++i)
|
for(count_t i = 0; i < ngram.size(); ++i)
|
||||||
IDs[i] = vocab_->GetWordID(ngram[i]);
|
IDs[i] = vocab_->GetWordID(ngram[i]);
|
||||||
return sbsqQuery(IDs, ngram.size(), codes, bStrict);
|
return sbsqQuery(IDs, ngram.size(), codes, bStrict);
|
||||||
}
|
}
|
||||||
template<typename T>
|
template<typename T>
|
||||||
int OnlineRLM<T>::sbsqQuery(const wordID_t* IDs, const int len, int* codes,
|
int OnlineRLM<T>::sbsqQuery(const wordID_t* IDs, const int len, int* codes,
|
||||||
bool bStrict) {
|
bool bStrict)
|
||||||
|
{
|
||||||
uint64_t filterIdx = 0;
|
uint64_t filterIdx = 0;
|
||||||
int val(0), fnd(0);
|
int val(0), fnd(0);
|
||||||
hpdEntry_t hpdItr;
|
hpdEntry_t hpdItr;
|
||||||
@ -240,14 +252,13 @@ int OnlineRLM<T>::sbsqQuery(const wordID_t* IDs, const int len, int* codes,
|
|||||||
if(hpdItr != this->dict_.end()) {
|
if(hpdItr != this->dict_.end()) {
|
||||||
val -= ((val & this->hitMask_) != 0) ? this->hitMask_ : 0; // account for previous hit marks
|
val -= ((val & this->hitMask_) != 0) ? this->hitMask_ : 0; // account for previous hit marks
|
||||||
}
|
}
|
||||||
}
|
} else if(bStrict) {
|
||||||
else if(bStrict) {
|
break;
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
// add to value array
|
// add to value array
|
||||||
codes[i] = val > 0 ? val : 0;
|
codes[i] = val > 0 ? val : 0;
|
||||||
}
|
}
|
||||||
while(bStrict && (fnd > 1)) { // do checks the other way
|
while(bStrict && (fnd > 1)) { // do checks the other way
|
||||||
val = PerfectHash<T>::query(&IDs[len - fnd], fnd - 1, hpdItr, filterIdx);
|
val = PerfectHash<T>::query(&IDs[len - fnd], fnd - 1, hpdItr, filterIdx);
|
||||||
if(val != -1) break; // if anything found
|
if(val != -1) break; // if anything found
|
||||||
else --fnd; // else decrement found
|
else --fnd; // else decrement found
|
||||||
@ -255,8 +266,9 @@ int OnlineRLM<T>::sbsqQuery(const wordID_t* IDs, const int len, int* codes,
|
|||||||
return fnd;
|
return fnd;
|
||||||
}
|
}
|
||||||
template<typename T>
|
template<typename T>
|
||||||
float OnlineRLM<T>::getProb(const wordID_t* ngram, int len,
|
float OnlineRLM<T>::getProb(const wordID_t* ngram, int len,
|
||||||
const void** state) {
|
const void** state)
|
||||||
|
{
|
||||||
static const float oovprob = log10(1.0 / (static_cast<float>(vocab_->Size()) - 1));
|
static const float oovprob = log10(1.0 / (static_cast<float>(vocab_->Size()) - 1));
|
||||||
float logprob(0);
|
float logprob(0);
|
||||||
const void* context = (state) ? *state : 0;
|
const void* context = (state) ? *state : 0;
|
||||||
@ -264,61 +276,61 @@ float OnlineRLM<T>::getProb(const wordID_t* ngram, int len,
|
|||||||
if(!cache_->checkCacheNgram(ngram, len, &logprob, &context)) {
|
if(!cache_->checkCacheNgram(ngram, len, &logprob, &context)) {
|
||||||
// get full prob and put in cache
|
// get full prob and put in cache
|
||||||
int num_fnd(0), den_val(0);
|
int num_fnd(0), den_val(0);
|
||||||
int in[len]; // in[] keeps counts of increasing order numerator
|
int in[len]; // in[] keeps counts of increasing order numerator
|
||||||
for(int i = 0; i < len; ++i) in[i] = 0;
|
for(int i = 0; i < len; ++i) in[i] = 0;
|
||||||
for(int i = len - 1; i >= 0; --i) {
|
for(int i = len - 1; i >= 0; --i) {
|
||||||
if(ngram[i] == vocab_->GetkOOVWordID()) break; // no need to query if OOV
|
if(ngram[i] == vocab_->GetkOOVWordID()) break; // no need to query if OOV
|
||||||
in[i] = query(&ngram[i], len - i);
|
in[i] = query(&ngram[i], len - i);
|
||||||
if(in[i] > 0) {
|
if(in[i] > 0) {
|
||||||
num_fnd = len - i;
|
num_fnd = len - i;
|
||||||
}
|
} else if(strict_checks_) break;
|
||||||
else if(strict_checks_) break;
|
|
||||||
}
|
}
|
||||||
while(num_fnd > 1) { // get lower order count
|
while(num_fnd > 1) { // get lower order count
|
||||||
//get sub-context of size one less than length found (exluding target)
|
//get sub-context of size one less than length found (exluding target)
|
||||||
if(((den_val = query(&ngram[len - num_fnd], num_fnd - 1)) > 0) &&
|
if(((den_val = query(&ngram[len - num_fnd], num_fnd - 1)) > 0) &&
|
||||||
(den_val >= in[len - num_fnd]) && (in[len - num_fnd] > 0)) {
|
(den_val >= in[len - num_fnd]) && (in[len - num_fnd] > 0)) {
|
||||||
break;
|
break;
|
||||||
}
|
} else --num_fnd; // else backoff to lower ngram order
|
||||||
else --num_fnd; // else backoff to lower ngram order
|
|
||||||
}
|
}
|
||||||
if(num_fnd == 1 && (in[len - 1] < 1)) // sanity check for unigrams
|
if(num_fnd == 1 && (in[len - 1] < 1)) // sanity check for unigrams
|
||||||
num_fnd = 0;
|
num_fnd = 0;
|
||||||
switch(num_fnd) { // find prob (need to refactor into precomputation)
|
switch(num_fnd) { // find prob (need to refactor into precomputation)
|
||||||
case 0: // OOV
|
case 0: // OOV
|
||||||
logprob = alpha_[len] + oovprob;
|
logprob = alpha_[len] + oovprob;
|
||||||
break;
|
break;
|
||||||
case 1: // unigram found only
|
case 1: // unigram found only
|
||||||
CHECK(in[len - 1] > 0);
|
CHECK(in[len - 1] > 0);
|
||||||
logprob = alpha_[len - 1] + (corpusSize_ > 0 ?
|
logprob = alpha_[len - 1] + (corpusSize_ > 0 ?
|
||||||
log10(static_cast<float>(in[len - 1]) / static_cast<float>(corpusSize_)) : 0);
|
log10(static_cast<float>(in[len - 1]) / static_cast<float>(corpusSize_)) : 0);
|
||||||
//logprob = alpha_[len - 1] +
|
//logprob = alpha_[len - 1] +
|
||||||
//log10(static_cast<float>(in[len - 1]) / static_cast<float>(corpusSize_));
|
//log10(static_cast<float>(in[len - 1]) / static_cast<float>(corpusSize_));
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
CHECK(den_val > 0);
|
CHECK(den_val > 0);
|
||||||
//if(subgram == in[len - found]) ++subgram; // avoid returning zero probs????
|
//if(subgram == in[len - found]) ++subgram; // avoid returning zero probs????
|
||||||
logprob = alpha_[len - num_fnd] +
|
logprob = alpha_[len - num_fnd] +
|
||||||
log10(static_cast<float>(in[len - num_fnd]) / static_cast<float>(den_val));
|
log10(static_cast<float>(in[len - num_fnd]) / static_cast<float>(den_val));
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
// need unique context
|
// need unique context
|
||||||
context = getContext(&ngram[len - num_fnd], num_fnd);
|
context = getContext(&ngram[len - num_fnd], num_fnd);
|
||||||
// put whatever was found in cache
|
// put whatever was found in cache
|
||||||
cache_->setCacheNgram(ngram, len, logprob, context);
|
cache_->setCacheNgram(ngram, len, logprob, context);
|
||||||
} // end checkCache
|
} // end checkCache
|
||||||
return logprob;
|
return logprob;
|
||||||
}
|
}
|
||||||
template<typename T>
|
template<typename T>
|
||||||
const void* OnlineRLM<T>::getContext(const wordID_t* ngram, int len) {
|
const void* OnlineRLM<T>::getContext(const wordID_t* ngram, int len)
|
||||||
|
{
|
||||||
int dummy(0);
|
int dummy(0);
|
||||||
float* addresses[len]; // only interested in addresses of cache
|
float* addresses[len]; // only interested in addresses of cache
|
||||||
CHECK(cache_->getCache2(ngram, len, &addresses[0], &dummy) == len);
|
CHECK(cache_->getCache2(ngram, len, &addresses[0], &dummy) == len);
|
||||||
// return address of cache node
|
// return address of cache node
|
||||||
return (const void*)addresses[0];
|
return (const void*)addresses[0];
|
||||||
}
|
}
|
||||||
template<typename T>
|
template<typename T>
|
||||||
void OnlineRLM<T>::randDelete(int num2del) {
|
void OnlineRLM<T>::randDelete(int num2del)
|
||||||
|
{
|
||||||
int deleted = 0;
|
int deleted = 0;
|
||||||
for(uint64_t i = 0; i < this->cells_; i++) {
|
for(uint64_t i = 0; i < this->cells_; i++) {
|
||||||
if(this->filter_->read(i) != 0) {
|
if(this->filter_->read(i) != 0) {
|
||||||
@ -329,18 +341,20 @@ void OnlineRLM<T>::randDelete(int num2del) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
template<typename T>
|
template<typename T>
|
||||||
int OnlineRLM<T>::countHits() {
|
int OnlineRLM<T>::countHits()
|
||||||
|
{
|
||||||
int hit(0);
|
int hit(0);
|
||||||
for(uint64_t i = 0; i < this->cells_; ++i)
|
for(uint64_t i = 0; i < this->cells_; ++i)
|
||||||
if(bHit_->testBit(i)) ++hit;
|
if(bHit_->testBit(i)) ++hit;
|
||||||
iterate(this->dict_, itr)
|
iterate(this->dict_, itr)
|
||||||
if((itr->second & this->hitMask_) != 0)
|
if((itr->second & this->hitMask_) != 0)
|
||||||
++hit;
|
++hit;
|
||||||
cerr << "Hit count = " << hit << endl;
|
cerr << "Hit count = " << hit << endl;
|
||||||
return hit;
|
return hit;
|
||||||
}
|
}
|
||||||
template<typename T>
|
template<typename T>
|
||||||
int OnlineRLM<T>::countPrefixes() {
|
int OnlineRLM<T>::countPrefixes()
|
||||||
|
{
|
||||||
int pfx(0);
|
int pfx(0);
|
||||||
for(uint64_t i = 0; i < this->cells_; ++i)
|
for(uint64_t i = 0; i < this->cells_; ++i)
|
||||||
if(bPrefix_->testBit(i)) ++pfx;
|
if(bPrefix_->testBit(i)) ++pfx;
|
||||||
@ -349,22 +363,24 @@ int OnlineRLM<T>::countPrefixes() {
|
|||||||
return pfx;
|
return pfx;
|
||||||
}
|
}
|
||||||
template<typename T>
|
template<typename T>
|
||||||
int OnlineRLM<T>::cleanUpHPD() {
|
int OnlineRLM<T>::cleanUpHPD()
|
||||||
|
{
|
||||||
cerr << "HPD size before = " << this->dict_.size() << endl;
|
cerr << "HPD size before = " << this->dict_.size() << endl;
|
||||||
std::vector<string> vDel, vtmp;
|
std::vector<string> vDel, vtmp;
|
||||||
iterate(this->dict_, itr) {
|
iterate(this->dict_, itr) {
|
||||||
if(((itr->second & this->hitMask_) == 0) && // if not hit during testing
|
if(((itr->second & this->hitMask_) == 0) && // if not hit during testing
|
||||||
(Utils::splitToStr(itr->first, vtmp, "¬") >= 3)) { // and higher order ngram
|
(Utils::splitToStr(itr->first, vtmp, "¬") >= 3)) { // and higher order ngram
|
||||||
vDel.push_back(itr->first);
|
vDel.push_back(itr->first);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
iterate(vDel, vitr)
|
iterate(vDel, vitr)
|
||||||
this->dict_.erase(*vitr);
|
this->dict_.erase(*vitr);
|
||||||
cerr << "HPD size after = " << this->dict_.size() << endl;
|
cerr << "HPD size after = " << this->dict_.size() << endl;
|
||||||
return vDel.size();
|
return vDel.size();
|
||||||
}
|
}
|
||||||
template<typename T>
|
template<typename T>
|
||||||
void OnlineRLM<T>::clearMarkings() {
|
void OnlineRLM<T>::clearMarkings()
|
||||||
|
{
|
||||||
cerr << "clearing all event hits\n";
|
cerr << "clearing all event hits\n";
|
||||||
bHit_->reset();
|
bHit_->reset();
|
||||||
count_t* value(0);
|
count_t* value(0);
|
||||||
@ -374,7 +390,8 @@ void OnlineRLM<T>::clearMarkings() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
template<typename T>
|
template<typename T>
|
||||||
void OnlineRLM<T>::save(FileHandler* fout) {
|
void OnlineRLM<T>::save(FileHandler* fout)
|
||||||
|
{
|
||||||
cerr << "Saving ORLM...\n";
|
cerr << "Saving ORLM...\n";
|
||||||
// save vocab
|
// save vocab
|
||||||
vocab_->Save(fout);
|
vocab_->Save(fout);
|
||||||
@ -387,7 +404,8 @@ void OnlineRLM<T>::save(FileHandler* fout) {
|
|||||||
cerr << "Finished saving ORLM." << endl;
|
cerr << "Finished saving ORLM." << endl;
|
||||||
}
|
}
|
||||||
template<typename T>
|
template<typename T>
|
||||||
void OnlineRLM<T>::load(FileHandler* fin) {
|
void OnlineRLM<T>::load(FileHandler* fin)
|
||||||
|
{
|
||||||
cerr << "Loading ORLM...\n";
|
cerr << "Loading ORLM...\n";
|
||||||
// load vocab first
|
// load vocab first
|
||||||
vocab_ = new Vocab(fin);
|
vocab_ = new Vocab(fin);
|
||||||
@ -402,12 +420,13 @@ void OnlineRLM<T>::load(FileHandler* fin) {
|
|||||||
PerfectHash<T>::load(fin);
|
PerfectHash<T>::load(fin);
|
||||||
}
|
}
|
||||||
template<typename T>
|
template<typename T>
|
||||||
void OnlineRLM<T>::removeNonMarked() {
|
void OnlineRLM<T>::removeNonMarked()
|
||||||
|
{
|
||||||
cerr << "deleting all unused events\n";
|
cerr << "deleting all unused events\n";
|
||||||
int deleted(0);
|
int deleted(0);
|
||||||
for(uint64_t i = 0; i < this->cells_; ++i) {
|
for(uint64_t i = 0; i < this->cells_; ++i) {
|
||||||
if(!(bHit_->testBit(i) || bPrefix_->testBit(i))
|
if(!(bHit_->testBit(i) || bPrefix_->testBit(i))
|
||||||
&& (this->filter_->read(i) != 0)) {
|
&& (this->filter_->read(i) != 0)) {
|
||||||
PerfectHash<T>::remove(i);
|
PerfectHash<T>::remove(i);
|
||||||
++deleted;
|
++deleted;
|
||||||
}
|
}
|
||||||
@ -429,36 +448,36 @@ float OnlineRLM<T>::getProb2(const wordID_t* ngram, int len, const void** state)
|
|||||||
// constrain cache queries using model assumptions
|
// constrain cache queries using model assumptions
|
||||||
int denom_len = cache_->getCache(ngram, len - 1, &denom_codes[0], &denom_found);
|
int denom_len = cache_->getCache(ngram, len - 1, &denom_codes[0], &denom_found);
|
||||||
cerr << "denom_len = " << denom_len << endl;
|
cerr << "denom_len = " << denom_len << endl;
|
||||||
int num_len = cache_->getCache(&ngram[len - denom_len - 1], denom_len + 1,
|
int num_len = cache_->getCache(&ngram[len - denom_len - 1], denom_len + 1,
|
||||||
&num_codes[0], &found);
|
&num_codes[0], &found);
|
||||||
cerr << "num_len= " << num_len << endl;
|
cerr << "num_len= " << num_len << endl;
|
||||||
// keed reducing ngram size until both denominator and numerator are found
|
// keed reducing ngram size until both denominator and numerator are found
|
||||||
// allowed to leave kUnknownCode in cache because we check for this.
|
// allowed to leave kUnknownCode in cache because we check for this.
|
||||||
found = num_len; // guaranteed to be <= denom_len + 1
|
found = num_len; // guaranteed to be <= denom_len + 1
|
||||||
// still check for OOV
|
// still check for OOV
|
||||||
for (int i = len - found; i < len; ++i)
|
for (int i = len - found; i < len; ++i)
|
||||||
if (ngram[i] == Vocab::kOOVWordID) {
|
if (ngram[i] == Vocab::kOOVWordID) {
|
||||||
found = len - i - 1;
|
found = len - i - 1;
|
||||||
}
|
}
|
||||||
// check for relative estimator
|
// check for relative estimator
|
||||||
while(found > 1) {
|
while(found > 1) {
|
||||||
if(*denom_codes[found-1] == cache_unk_ &&
|
if(*denom_codes[found-1] == cache_unk_ &&
|
||||||
((*denom_codes[found-1] = query(&ngram[len-found], found-1)) == 0)) {
|
((*denom_codes[found-1] = query(&ngram[len-found], found-1)) == 0)) {
|
||||||
//!struct_->query(&ngram[len-*found], *found-1, kMainEventIdx, denom_codes[*found-1])) {
|
//!struct_->query(&ngram[len-*found], *found-1, kMainEventIdx, denom_codes[*found-1])) {
|
||||||
*num_codes[found] = cache_unk_;
|
*num_codes[found] = cache_unk_;
|
||||||
} else {
|
} else {
|
||||||
if(*num_codes[found] != cache_unk_ ||
|
if(*num_codes[found] != cache_unk_ ||
|
||||||
((*num_codes[found] = query(&ngram[len-found], found)) <= *denom_codes[found-1]))
|
((*num_codes[found] = query(&ngram[len-found], found)) <= *denom_codes[found-1]))
|
||||||
// struct_->query(&ngram[len-*found], *found, kMainEventIdx,
|
// struct_->query(&ngram[len-*found], *found, kMainEventIdx,
|
||||||
// num_codes[*found], *denom_codes[*found-1]))
|
// num_codes[*found], *denom_codes[*found-1]))
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
--found;
|
--found;
|
||||||
}
|
}
|
||||||
// didn't find bigram numerator or unigram denominator
|
// didn't find bigram numerator or unigram denominator
|
||||||
if (found == 1)
|
if (found == 1)
|
||||||
found = *num_codes[1] != cache_unk_
|
found = *num_codes[1] != cache_unk_
|
||||||
|| ((*num_codes[1] = query(&ngram[len - 1], 1)) != 0);
|
|| ((*num_codes[1] = query(&ngram[len - 1], 1)) != 0);
|
||||||
//struct_->query(&ngram[len - 1], 1, kMainEventIdx, num_codes[1]);
|
//struct_->query(&ngram[len - 1], 1, kMainEventIdx, num_codes[1]);
|
||||||
// ....
|
// ....
|
||||||
// return estimate applying correct backoff score (precomputed)
|
// return estimate applying correct backoff score (precomputed)
|
||||||
@ -469,20 +488,20 @@ float OnlineRLM<T>::getProb2(const wordID_t* ngram, int len, const void** state)
|
|||||||
//log_prob = stupid_backoff_log10_[len] + uniform_log10prob_;
|
//log_prob = stupid_backoff_log10_[len] + uniform_log10prob_;
|
||||||
break;
|
break;
|
||||||
case 1: // unigram over whole corpus
|
case 1: // unigram over whole corpus
|
||||||
log_prob = alpha_[len - 1] +
|
log_prob = alpha_[len - 1] +
|
||||||
log10(static_cast<float>(*num_codes[1]) / static_cast<float>(corpusSize_));
|
log10(static_cast<float>(*num_codes[1]) / static_cast<float>(corpusSize_));
|
||||||
//log_prob = log_quantiser_->getLog10Value(*num_codes[1]) - corpus_size_log10_
|
//log_prob = log_quantiser_->getLog10Value(*num_codes[1]) - corpus_size_log10_
|
||||||
// + stupid_backoff_log10_[len - 1]; // precomputed
|
// + stupid_backoff_log10_[len - 1]; // precomputed
|
||||||
break;
|
break;
|
||||||
default: // otherwise use both statistics and (possibly zero) backoff weight
|
default: // otherwise use both statistics and (possibly zero) backoff weight
|
||||||
log_prob = alpha_[len - found] +
|
log_prob = alpha_[len - found] +
|
||||||
log10(static_cast<float>(*num_codes[found]) / static_cast<float>(*denom_codes[found-1]));
|
log10(static_cast<float>(*num_codes[found]) / static_cast<float>(*denom_codes[found-1]));
|
||||||
//log_prob = log_quantiser_->getLog10Value(*num_codes[*found ])
|
//log_prob = log_quantiser_->getLog10Value(*num_codes[*found ])
|
||||||
// - log_quantiser_->getLog10Value(*denom_codes[*found - 1])
|
// - log_quantiser_->getLog10Value(*denom_codes[*found - 1])
|
||||||
// + stupid_backoff_log10_[len - *found];
|
// + stupid_backoff_log10_[len - *found];
|
||||||
}
|
}
|
||||||
context_state = (const void*)num_codes[found == len ? found - 1 : found];;
|
context_state = (const void*)num_codes[found == len ? found - 1 : found];;
|
||||||
//probCache_->store(len, log_prob, context_state);
|
//probCache_->store(len, log_prob, context_state);
|
||||||
if (state)
|
if (state)
|
||||||
*state = context_state;
|
*state = context_state;
|
||||||
return log_prob;
|
return log_prob;
|
||||||
|
@ -1,10 +1,11 @@
|
|||||||
#include "params.h"
|
#include "params.h"
|
||||||
|
|
||||||
namespace Moses {
|
namespace Moses
|
||||||
|
{
|
||||||
// parameter constants
|
// parameter constants
|
||||||
const std::string Parameters::kNotSetValue = "__NOT_SET__";
|
const std::string Parameters::kNotSetValue = "__NOT_SET__";
|
||||||
|
|
||||||
const int Parameters::kBoolValue = 0;
|
const int Parameters::kBoolValue = 0;
|
||||||
const int Parameters::kIntValue = 1;
|
const int Parameters::kIntValue = 1;
|
||||||
const int Parameters::kFloatValue = 2;
|
const int Parameters::kFloatValue = 2;
|
||||||
const int Parameters::kStringValue = 3;
|
const int Parameters::kStringValue = 3;
|
||||||
@ -13,26 +14,30 @@ const int Parameters::kUndefinedValue = -1;
|
|||||||
const std::string Parameters::kTrueValue = "1";
|
const std::string Parameters::kTrueValue = "1";
|
||||||
const std::string Parameters::kFalseValue = "0";
|
const std::string Parameters::kFalseValue = "0";
|
||||||
|
|
||||||
Parameters::Parameters(const ParamDefs * paramdefs, const count_t paramNum) {
|
Parameters::Parameters(const ParamDefs * paramdefs, const count_t paramNum)
|
||||||
|
{
|
||||||
initialize(paramdefs, paramNum);
|
initialize(paramdefs, paramNum);
|
||||||
}
|
}
|
||||||
|
|
||||||
Parameters::Parameters(int argc, char ** argv, const ParamDefs * paramdefs,
|
Parameters::Parameters(int argc, char ** argv, const ParamDefs * paramdefs,
|
||||||
const count_t paramNum) {
|
const count_t paramNum)
|
||||||
|
{
|
||||||
initialize(paramdefs, paramNum);
|
initialize(paramdefs, paramNum);
|
||||||
loadParams(argc, argv);
|
loadParams(argc, argv);
|
||||||
}
|
}
|
||||||
|
|
||||||
void Parameters::initialize(const ParamDefs * paramdefs, const count_t paramNum) {
|
void Parameters::initialize(const ParamDefs * paramdefs, const count_t paramNum)
|
||||||
|
{
|
||||||
for( count_t i = 0; i < paramNum; i++ ) {
|
for( count_t i = 0; i < paramNum; i++ ) {
|
||||||
params_[paramdefs[i].name] = paramdefs[i]; // assign name
|
params_[paramdefs[i].name] = paramdefs[i]; // assign name
|
||||||
}
|
}
|
||||||
cerr << "Default parameter values:\n";
|
cerr << "Default parameter values:\n";
|
||||||
iterate(params_, itr)
|
iterate(params_, itr)
|
||||||
cerr << "\t" << itr->first << " --> " << itr->second.value << endl;
|
cerr << "\t" << itr->first << " --> " << itr->second.value << endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Parameters::loadParams(int argc, char ** argv) {
|
bool Parameters::loadParams(int argc, char ** argv)
|
||||||
|
{
|
||||||
// load params from commandline args
|
// load params from commandline args
|
||||||
//if( argc < 3 ) {
|
//if( argc < 3 ) {
|
||||||
// fprintf(stderr, "ERROR: No parameters. Use \"-config\" or \"-f\" to specify configuration file.\n");
|
// fprintf(stderr, "ERROR: No parameters. Use \"-config\" or \"-f\" to specify configuration file.\n");
|
||||||
@ -66,7 +71,7 @@ bool Parameters::loadParams(int argc, char ** argv) {
|
|||||||
std::string val = argv[i+1];
|
std::string val = argv[i+1];
|
||||||
Utils::trim(val);
|
Utils::trim(val);
|
||||||
if( param == "config" )
|
if( param == "config" )
|
||||||
load_from_file = true;
|
load_from_file = true;
|
||||||
if(!setParamValue(param, val)) {
|
if(!setParamValue(param, val)) {
|
||||||
std::cerr << "Invalid Param name->value " << param << "->" << val << std::endl;
|
std::cerr << "Invalid Param name->value " << param << "->" << val << std::endl;
|
||||||
return false;
|
return false;
|
||||||
@ -80,35 +85,40 @@ bool Parameters::loadParams(int argc, char ** argv) {
|
|||||||
return success;
|
return success;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string Parameters::normaliseParamName(const std::string & name) {
|
std::string Parameters::normaliseParamName(const std::string & name)
|
||||||
|
{
|
||||||
// Map valid abbreviations to long names. Retain other names.
|
// Map valid abbreviations to long names. Retain other names.
|
||||||
if( params_.find(name) == params_.end() )
|
if( params_.find(name) == params_.end() )
|
||||||
iterate(params_, i)
|
iterate(params_, i)
|
||||||
if( i->second.abbrev == name )
|
if( i->second.abbrev == name )
|
||||||
return i->first;
|
return i->first;
|
||||||
return name;
|
return name;
|
||||||
}
|
}
|
||||||
|
|
||||||
int Parameters::getValueType(const std::string& name) {
|
int Parameters::getValueType(const std::string& name)
|
||||||
|
{
|
||||||
if(params_.find(name) != params_.end())
|
if(params_.find(name) != params_.end())
|
||||||
return params_[name].type;
|
return params_[name].type;
|
||||||
return Parameters::kUndefinedValue;
|
return Parameters::kUndefinedValue;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Parameters::isValidParamName(const std::string & name) {
|
bool Parameters::isValidParamName(const std::string & name)
|
||||||
return params_.find(name) != params_.end();
|
{
|
||||||
|
return params_.find(name) != params_.end();
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Parameters::setParamValue(const std::string& name, const std::string& val) {
|
bool Parameters::setParamValue(const std::string& name, const std::string& val)
|
||||||
// TODO: Add basic type checking w verifyValueType()
|
{
|
||||||
bool set = isValidParamName(name);
|
// TODO: Add basic type checking w verifyValueType()
|
||||||
if(set) {
|
bool set = isValidParamName(name);
|
||||||
params_[name].value = val;
|
if(set) {
|
||||||
|
params_[name].value = val;
|
||||||
std::cerr << "PARAM SET: "<< name << "=" << val << std::endl;
|
std::cerr << "PARAM SET: "<< name << "=" << val << std::endl;
|
||||||
}
|
}
|
||||||
return( set );
|
return( set );
|
||||||
}
|
}
|
||||||
std::string Parameters::getParamValue(const std::string& name) {
|
std::string Parameters::getParamValue(const std::string& name)
|
||||||
|
{
|
||||||
std::string value = Parameters::kNotSetValue;
|
std::string value = Parameters::kNotSetValue;
|
||||||
if(isValidParamName(name))
|
if(isValidParamName(name))
|
||||||
if(params_.find(name) != params_.end())
|
if(params_.find(name) != params_.end())
|
||||||
@ -117,43 +127,46 @@ std::string Parameters::getParamValue(const std::string& name) {
|
|||||||
value = kFalseValue;
|
value = kFalseValue;
|
||||||
return value;
|
return value;
|
||||||
}
|
}
|
||||||
std::string Parameters::getParam(const std::string& name) {
|
std::string Parameters::getParam(const std::string& name)
|
||||||
|
{
|
||||||
return getParamValue(name);
|
return getParamValue(name);
|
||||||
/*void* Parameters::getParam(const std::string& name) {
|
/*void* Parameters::getParam(const std::string& name) {
|
||||||
void* paramVal = 0;
|
void* paramVal = 0;
|
||||||
int type = getValueType(name);
|
int type = getValueType(name);
|
||||||
const char* sval = getParamValue(name).c_str();
|
const char* sval = getParamValue(name).c_str();
|
||||||
switch(type) {
|
switch(type) {
|
||||||
case kIntValue: {
|
case kIntValue: {
|
||||||
int ival = atoi(sval);
|
int ival = atoi(sval);
|
||||||
paramVal = (void*)&ival;
|
paramVal = (void*)&ival;
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
|
case kFloatValue: {
|
||||||
|
float fval = atof(sval);
|
||||||
|
paramVal = (void*)&fval;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case kStringValue: {
|
||||||
|
paramVal = (void*)sval;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case kBoolValue: {
|
||||||
|
bool bval = sval == Parameters::kTrueValue ? true : false;
|
||||||
|
paramVal = (void*)&bval;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
default: // --> Parameters::kUndefinedValue
|
||||||
|
paramVal = (void*)sval; // will set to Parameters::kNotSetValue
|
||||||
}
|
}
|
||||||
case kFloatValue: {
|
return paramVal;*/
|
||||||
float fval = atof(sval);
|
|
||||||
paramVal = (void*)&fval;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case kStringValue: {
|
|
||||||
paramVal = (void*)sval;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case kBoolValue: {
|
|
||||||
bool bval = sval == Parameters::kTrueValue ? true : false;
|
|
||||||
paramVal = (void*)&bval;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
default: // --> Parameters::kUndefinedValue
|
|
||||||
paramVal = (void*)sval; // will set to Parameters::kNotSetValue
|
|
||||||
}
|
|
||||||
return paramVal;*/
|
|
||||||
}
|
}
|
||||||
bool Parameters::verifyValueType(const std::string& name, const std::string& val) {
|
bool Parameters::verifyValueType(const std::string& name, const std::string& val)
|
||||||
|
{
|
||||||
// Implement basic type checking
|
// Implement basic type checking
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
int Parameters::getParamCount() const {
|
int Parameters::getParamCount() const
|
||||||
|
{
|
||||||
return params_.size();
|
return params_.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -161,7 +174,8 @@ int Parameters::getParamCount() const {
|
|||||||
* HAVE TO CHANGE loadParams() from file to not overwrite command lines but
|
* HAVE TO CHANGE loadParams() from file to not overwrite command lines but
|
||||||
* override default if different*/
|
* override default if different*/
|
||||||
bool Parameters::loadParams(const std::string & file_path,
|
bool Parameters::loadParams(const std::string & file_path,
|
||||||
std::set<std::string>& setParams) {
|
std::set<std::string>& setParams)
|
||||||
|
{
|
||||||
// parameters loaded from file don't override cmd line paramters
|
// parameters loaded from file don't override cmd line paramters
|
||||||
/*std::set<std::string>::iterator end = setParams.end();
|
/*std::set<std::string>::iterator end = setParams.end();
|
||||||
FileHandler file(file_path.c_str(), std::ios::in);
|
FileHandler file(file_path.c_str(), std::ios::in);
|
||||||
|
@ -10,20 +10,22 @@
|
|||||||
#include "utils.h"
|
#include "utils.h"
|
||||||
#include "types.h"
|
#include "types.h"
|
||||||
|
|
||||||
#define NumOfParams(paramArray) (sizeof(paramArray)/sizeof((paramArray)[0]))
|
#define NumOfParams(paramArray) (sizeof(paramArray)/sizeof((paramArray)[0]))
|
||||||
|
|
||||||
namespace Moses {
|
namespace Moses
|
||||||
|
{
|
||||||
typedef struct ParamDefs {
|
typedef struct ParamDefs {
|
||||||
std::string name;
|
std::string name;
|
||||||
std::string value;
|
std::string value;
|
||||||
std::string abbrev;
|
std::string abbrev;
|
||||||
int type;
|
int type;
|
||||||
std::string description;
|
std::string description;
|
||||||
} ParamDefs;
|
} ParamDefs;
|
||||||
|
|
||||||
class Parameters {
|
class Parameters
|
||||||
|
{
|
||||||
public:
|
public:
|
||||||
static const std::string kNotSetValue;
|
static const std::string kNotSetValue;
|
||||||
static const int kBoolValue;
|
static const int kBoolValue;
|
||||||
static const int kIntValue;
|
static const int kIntValue;
|
||||||
static const int kFloatValue;
|
static const int kFloatValue;
|
||||||
@ -31,15 +33,15 @@ public:
|
|||||||
static const int kUndefinedValue;
|
static const int kUndefinedValue;
|
||||||
static const std::string kFalseValue;
|
static const std::string kFalseValue;
|
||||||
static const std::string kTrueValue;
|
static const std::string kTrueValue;
|
||||||
|
|
||||||
Parameters(const ParamDefs * paramdefs, const count_t paramNum);
|
Parameters(const ParamDefs * paramdefs, const count_t paramNum);
|
||||||
Parameters(int argc, char** argv, const ParamDefs * paramdefs, const count_t paramNum);
|
Parameters(int argc, char** argv, const ParamDefs * paramdefs, const count_t paramNum);
|
||||||
~Parameters() {}
|
~Parameters() {}
|
||||||
bool loadParams(int argc, char ** argv);
|
bool loadParams(int argc, char ** argv);
|
||||||
bool loadParams(const std::string& param_file, std::set<std::string>&);
|
bool loadParams(const std::string& param_file, std::set<std::string>&);
|
||||||
int getValueType(const std::string & name);
|
int getValueType(const std::string & name);
|
||||||
bool setParamValue(const std::string& name, const std::string& value);
|
bool setParamValue(const std::string& name, const std::string& value);
|
||||||
bool verifyValueType(const std::string& name, const std::string& value);
|
bool verifyValueType(const std::string& name, const std::string& value);
|
||||||
bool isValidParamName(const std::string & name);
|
bool isValidParamName(const std::string & name);
|
||||||
std::string getParamValue(const std::string& name);
|
std::string getParamValue(const std::string& name);
|
||||||
//void* getParam(const std::string& name);
|
//void* getParam(const std::string& name);
|
||||||
|
@ -8,17 +8,18 @@
|
|||||||
#include "RandLMFilter.h"
|
#include "RandLMFilter.h"
|
||||||
#include "quantizer.h"
|
#include "quantizer.h"
|
||||||
/*
|
/*
|
||||||
* PerfectHash handles setting up hash functions and storage
|
* PerfectHash handles setting up hash functions and storage
|
||||||
* for LM data.
|
* for LM data.
|
||||||
*/
|
*/
|
||||||
using randlm::Filter;
|
using randlm::Filter;
|
||||||
using randlm::BitFilter;
|
using randlm::BitFilter;
|
||||||
typedef std::map<string, count_t> hpDict_t;
|
typedef std::map<string, count_t> hpDict_t;
|
||||||
typedef hpDict_t::iterator hpdEntry_t;
|
typedef hpDict_t::iterator hpdEntry_t;
|
||||||
static count_t collisions_ = 0;
|
static count_t collisions_ = 0;
|
||||||
/* Based on Mortenson et. al. 2006 */
|
/* Based on Mortenson et. al. 2006 */
|
||||||
template<typename T>
|
template<typename T>
|
||||||
class PerfectHash {
|
class PerfectHash
|
||||||
|
{
|
||||||
public:
|
public:
|
||||||
PerfectHash(uint16_t MBs, int width, int bucketRange, float qBase);
|
PerfectHash(uint16_t MBs, int width, int bucketRange, float qBase);
|
||||||
PerfectHash(FileHandler* fin) {
|
PerfectHash(FileHandler* fin) {
|
||||||
@ -39,11 +40,11 @@ protected:
|
|||||||
uint8_t* idxTracker_;
|
uint8_t* idxTracker_;
|
||||||
uint64_t insert(const wordID_t* IDs, const int len, const count_t value);
|
uint64_t insert(const wordID_t* IDs, const int len, const count_t value);
|
||||||
bool update(const wordID_t* IDs, const int len, const count_t value,
|
bool update(const wordID_t* IDs, const int len, const count_t value,
|
||||||
hpdEntry_t& hpdAddr, uint64_t& filterIdx);
|
hpdEntry_t& hpdAddr, uint64_t& filterIdx);
|
||||||
bool update2(const wordID_t* IDs, const int len, const count_t value,
|
bool update2(const wordID_t* IDs, const int len, const count_t value,
|
||||||
hpdEntry_t& hpdAddr, uint64_t& filterIdx);
|
hpdEntry_t& hpdAddr, uint64_t& filterIdx);
|
||||||
int query(const wordID_t* IDs, const int len,
|
int query(const wordID_t* IDs, const int len,
|
||||||
hpdEntry_t& hpdAddr, uint64_t& filterIdx);
|
hpdEntry_t& hpdAddr, uint64_t& filterIdx);
|
||||||
virtual void remove(const wordID_t* IDs, const int len);
|
virtual void remove(const wordID_t* IDs, const int len);
|
||||||
void remove(uint64_t index);
|
void remove(uint64_t index);
|
||||||
void save(FileHandler* fout);
|
void save(FileHandler* fout);
|
||||||
@ -52,32 +53,33 @@ protected:
|
|||||||
//pointer to a specific entry in a hpDict_t
|
//pointer to a specific entry in a hpDict_t
|
||||||
virtual void markQueried(hpdEntry_t&)=0;
|
virtual void markQueried(hpdEntry_t&)=0;
|
||||||
private:
|
private:
|
||||||
T nonZeroSignature(const wordID_t* IDs, const int len, count_t bucket);
|
T nonZeroSignature(const wordID_t* IDs, const int len, count_t bucket);
|
||||||
string hpDictKeyValue(const wordID_t* IDs, const int len);
|
string hpDictKeyValue(const wordID_t* IDs, const int len);
|
||||||
uint64_t memBound_; // total memory bound in bytes
|
uint64_t memBound_; // total memory bound in bytes
|
||||||
uint16_t cellWidth_; // in bits
|
uint16_t cellWidth_; // in bits
|
||||||
UnivHash_linear<count_t>* bucketHash_;
|
UnivHash_linear<count_t>* bucketHash_;
|
||||||
UnivHash_linear<T>* fingerHash_;
|
UnivHash_linear<T>* fingerHash_;
|
||||||
LogQtizer* qtizer_;
|
LogQtizer* qtizer_;
|
||||||
};
|
};
|
||||||
template<typename T>
|
template<typename T>
|
||||||
PerfectHash<T>::PerfectHash(uint16_t MBs, int width, int bucketRange,
|
PerfectHash<T>::PerfectHash(uint16_t MBs, int width, int bucketRange,
|
||||||
float qBase): hitMask_(1 << 31), memBound_(MBs * (1ULL << 20)),
|
float qBase): hitMask_(1 << 31), memBound_(MBs * (1ULL << 20)),
|
||||||
cellWidth_(width) {
|
cellWidth_(width)
|
||||||
|
{
|
||||||
bucketRange_ = static_cast<uint8_t>(bucketRange);
|
bucketRange_ = static_cast<uint8_t>(bucketRange);
|
||||||
if(bucketRange > 255) {
|
if(bucketRange > 255) {
|
||||||
cerr << "ERROR: Max bucket range is > 2^8\n";
|
cerr << "ERROR: Max bucket range is > 2^8\n";
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
qtizer_ = new LogQtizer(qBase);
|
qtizer_ = new LogQtizer(qBase);
|
||||||
int valBits = (int)ceil(log2((float)qtizer_->maxcode()));
|
int valBits = (int)ceil(log2((float)qtizer_->maxcode()));
|
||||||
cerr << "BITS FOR VALUES ARRAY = " << valBits << endl;
|
cerr << "BITS FOR VALUES ARRAY = " << valBits << endl;
|
||||||
uint64_t totalBits = memBound_ << 3;
|
uint64_t totalBits = memBound_ << 3;
|
||||||
cells_ = (uint64_t) ceil((float)totalBits / (float)(cellWidth_ + valBits)); // upper bound on cells
|
cells_ = (uint64_t) ceil((float)totalBits / (float)(cellWidth_ + valBits)); // upper bound on cells
|
||||||
cells_ += (cells_ % bucketRange_); // make cells multiple of bucket range
|
cells_ += (cells_ % bucketRange_); // make cells multiple of bucket range
|
||||||
totBuckets_ = (cells_ / bucketRange_) - 1; // minus 1 so totBuckets * bucksize + bucksize = cells
|
totBuckets_ = (cells_ / bucketRange_) - 1; // minus 1 so totBuckets * bucksize + bucksize = cells
|
||||||
filter_ = new Filter<T>(cells_, cellWidth_);
|
filter_ = new Filter<T>(cells_, cellWidth_);
|
||||||
values_ = new Filter<T>(cells_, valBits);
|
values_ = new Filter<T>(cells_, valBits);
|
||||||
idxTracker_ = new uint8_t[totBuckets_];
|
idxTracker_ = new uint8_t[totBuckets_];
|
||||||
for(int i=0; i < totBuckets_; ++i) idxTracker_[i] = 0;
|
for(int i=0; i < totBuckets_; ++i) idxTracker_[i] = 0;
|
||||||
// initialize ranges for each hash function
|
// initialize ranges for each hash function
|
||||||
@ -85,7 +87,8 @@ PerfectHash<T>::PerfectHash(uint16_t MBs, int width, int bucketRange,
|
|||||||
fingerHash_ = new UnivHash_linear<T>(pow(2.0f, cellWidth_), MAX_HASH_FUNCS, PRIME);
|
fingerHash_ = new UnivHash_linear<T>(pow(2.0f, cellWidth_), MAX_HASH_FUNCS, PRIME);
|
||||||
}
|
}
|
||||||
template<typename T>
|
template<typename T>
|
||||||
PerfectHash<T>::~PerfectHash() {
|
PerfectHash<T>::~PerfectHash()
|
||||||
|
{
|
||||||
delete[] idxTracker_;
|
delete[] idxTracker_;
|
||||||
delete filter_;
|
delete filter_;
|
||||||
filter_ = NULL;
|
filter_ = NULL;
|
||||||
@ -94,22 +97,22 @@ PerfectHash<T>::~PerfectHash() {
|
|||||||
delete qtizer_;
|
delete qtizer_;
|
||||||
delete values_;
|
delete values_;
|
||||||
}
|
}
|
||||||
template<typename T>
|
template<typename T>
|
||||||
uint64_t PerfectHash<T>::insert(const wordID_t* IDs, const int len,
|
uint64_t PerfectHash<T>::insert(const wordID_t* IDs, const int len,
|
||||||
const count_t value) {
|
const count_t value)
|
||||||
|
{
|
||||||
count_t bucket = (bucketHash_->size() > 1 ? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len));
|
count_t bucket = (bucketHash_->size() > 1 ? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len));
|
||||||
if(idxTracker_[bucket] < (int)bucketRange_) { // if empty rows
|
if(idxTracker_[bucket] < (int)bucketRange_) { // if empty rows
|
||||||
// restriction on fprint value is non-zero
|
// restriction on fprint value is non-zero
|
||||||
T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
|
T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
|
||||||
uint64_t emptyidx = cells_ + 1;
|
uint64_t emptyidx = cells_ + 1;
|
||||||
uint64_t index = bucket * bucketRange_, // starting bucket row
|
uint64_t index = bucket * bucketRange_, // starting bucket row
|
||||||
lastrow = index + bucketRange_; // ending row
|
lastrow = index + bucketRange_; // ending row
|
||||||
while(index < lastrow) { // unique so check each row for "matching" signature
|
while(index < lastrow) { // unique so check each row for "matching" signature
|
||||||
T filterVal = filter_->read(index);
|
T filterVal = filter_->read(index);
|
||||||
if((filterVal == 0) && (emptyidx == cells_ + 1)) { // record first empty row
|
if((filterVal == 0) && (emptyidx == cells_ + 1)) { // record first empty row
|
||||||
emptyidx = index;
|
emptyidx = index;
|
||||||
}
|
} else if(filterVal == fp) {
|
||||||
else if(filterVal == fp) {
|
|
||||||
++collisions_;
|
++collisions_;
|
||||||
dict_[hpDictKeyValue(IDs, len)] = value; // store exact in hpd
|
dict_[hpDictKeyValue(IDs, len)] = value; // store exact in hpd
|
||||||
return cells_ + 1; // finished
|
return cells_ + 1; // finished
|
||||||
@ -122,20 +125,20 @@ uint64_t PerfectHash<T>::insert(const wordID_t* IDs, const int len,
|
|||||||
values_->write(emptyidx, code);
|
values_->write(emptyidx, code);
|
||||||
++idxTracker_[bucket]; // keep track of bucket size
|
++idxTracker_[bucket]; // keep track of bucket size
|
||||||
return emptyidx;
|
return emptyidx;
|
||||||
}
|
} else { // bucket is full
|
||||||
else { // bucket is full
|
|
||||||
dict_[hpDictKeyValue(IDs, len)] = value; // add to hpd
|
dict_[hpDictKeyValue(IDs, len)] = value; // add to hpd
|
||||||
return cells_ + 1;
|
return cells_ + 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
template<typename T>
|
template<typename T>
|
||||||
bool PerfectHash<T>::update(const wordID_t* IDs, const int len,
|
bool PerfectHash<T>::update(const wordID_t* IDs, const int len,
|
||||||
const count_t value, hpdEntry_t& hpdAddr, uint64_t& filterIdx) {
|
const count_t value, hpdEntry_t& hpdAddr, uint64_t& filterIdx)
|
||||||
|
{
|
||||||
// check if key is in high perf. dictionary
|
// check if key is in high perf. dictionary
|
||||||
filterIdx = cells_ + 1;
|
filterIdx = cells_ + 1;
|
||||||
string skey = hpDictKeyValue(IDs, len);
|
string skey = hpDictKeyValue(IDs, len);
|
||||||
if((hpdAddr = dict_.find(skey)) != dict_.end()) {
|
if((hpdAddr = dict_.find(skey)) != dict_.end()) {
|
||||||
hpdAddr->second = value;
|
hpdAddr->second = value;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
// else hash ngram
|
// else hash ngram
|
||||||
@ -144,66 +147,67 @@ bool PerfectHash<T>::update(const wordID_t* IDs, const int len,
|
|||||||
// restriction on fprint value is non-zero
|
// restriction on fprint value is non-zero
|
||||||
T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
|
T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
|
||||||
uint64_t index = bucket * bucketRange_, // starting bucket row
|
uint64_t index = bucket * bucketRange_, // starting bucket row
|
||||||
lastrow = index + bucketRange_;
|
lastrow = index + bucketRange_;
|
||||||
while(index < lastrow) { // must check each row for matching fp event
|
while(index < lastrow) { // must check each row for matching fp event
|
||||||
T filterVal = filter_->read(index);
|
T filterVal = filter_->read(index);
|
||||||
if(filterVal == fp) { // found event w.h.p.
|
if(filterVal == fp) { // found event w.h.p.
|
||||||
values_->write(index, (T)qtizer_->code(value));
|
values_->write(index, (T)qtizer_->code(value));
|
||||||
filterIdx = index;
|
filterIdx = index;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
++index;
|
++index;
|
||||||
}
|
}
|
||||||
// could add if it gets here.
|
// could add if it gets here.
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
template<typename T>
|
template<typename T>
|
||||||
int PerfectHash<T>::query(const wordID_t* IDs, const int len,
|
int PerfectHash<T>::query(const wordID_t* IDs, const int len,
|
||||||
hpdEntry_t& hpdAddr, uint64_t& filterIdx) {
|
hpdEntry_t& hpdAddr, uint64_t& filterIdx)
|
||||||
|
{
|
||||||
// check if key is in high perf. dictionary
|
// check if key is in high perf. dictionary
|
||||||
string skey = hpDictKeyValue(IDs, len);
|
string skey = hpDictKeyValue(IDs, len);
|
||||||
if((hpdAddr = dict_.find(skey)) != dict_.end()) {
|
if((hpdAddr = dict_.find(skey)) != dict_.end()) {
|
||||||
filterIdx = cells_ + 1;
|
filterIdx = cells_ + 1;
|
||||||
return(hpdAddr->second); // returns copy of value
|
return(hpdAddr->second); // returns copy of value
|
||||||
}
|
} else { // check if key is in filter
|
||||||
else { // check if key is in filter
|
// get bucket
|
||||||
// get bucket
|
|
||||||
//count_t bucket = bucketHash_->hash(IDs, len);
|
//count_t bucket = bucketHash_->hash(IDs, len);
|
||||||
count_t bucket = (bucketHash_->size() > 1 ? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len));
|
count_t bucket = (bucketHash_->size() > 1 ? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len));
|
||||||
// restriction on fprint value is non-zero
|
// restriction on fprint value is non-zero
|
||||||
T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
|
T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
|
||||||
// return value if ngram is in filter
|
// return value if ngram is in filter
|
||||||
uint64_t index = bucket * bucketRange_,
|
uint64_t index = bucket * bucketRange_,
|
||||||
lastrow = index + bucketRange_;
|
lastrow = index + bucketRange_;
|
||||||
for(; index < lastrow; ++index) {
|
for(; index < lastrow; ++index) {
|
||||||
if(filter_->read(index) == fp) {
|
if(filter_->read(index) == fp) {
|
||||||
//cout << "fp = " << fp << "\tbucket = " << bucket << "\tfilter =" <<
|
//cout << "fp = " << fp << "\tbucket = " << bucket << "\tfilter =" <<
|
||||||
//filter_->read(index) << "\tcode = " << code << endl;
|
//filter_->read(index) << "\tcode = " << code << endl;
|
||||||
filterIdx = index;
|
filterIdx = index;
|
||||||
hpdAddr = dict_.end();
|
hpdAddr = dict_.end();
|
||||||
return (int)qtizer_->value(values_->read(index));
|
return (int)qtizer_->value(values_->read(index));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
template<typename T>
|
template<typename T>
|
||||||
void PerfectHash<T>::remove(const wordID_t* IDs, const int len) {
|
void PerfectHash<T>::remove(const wordID_t* IDs, const int len)
|
||||||
|
{
|
||||||
// delete key if in high perf. dictionary
|
// delete key if in high perf. dictionary
|
||||||
string skey = hpDictKeyValue(IDs, len);
|
string skey = hpDictKeyValue(IDs, len);
|
||||||
if(dict_.find(skey) != dict_.end())
|
if(dict_.find(skey) != dict_.end())
|
||||||
dict_.erase(skey);
|
dict_.erase(skey);
|
||||||
else { // check if key is in filter
|
else { // check if key is in filter
|
||||||
// get small representation for ngrams
|
// get small representation for ngrams
|
||||||
//count_t bucket = bucketHash_->hash(IDs, len);
|
//count_t bucket = bucketHash_->hash(IDs, len);
|
||||||
count_t bucket = (bucketHash_->size() > 1? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len));
|
count_t bucket = (bucketHash_->size() > 1? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len));
|
||||||
// retrieve non zero fingerprint for ngram
|
// retrieve non zero fingerprint for ngram
|
||||||
T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
|
T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
|
||||||
// return value if ngram is in filter
|
// return value if ngram is in filter
|
||||||
uint64_t index = bucket * bucketRange_,
|
uint64_t index = bucket * bucketRange_,
|
||||||
lastrow = index + bucketRange_;
|
lastrow = index + bucketRange_;
|
||||||
for(; index < lastrow; ++index) {
|
for(; index < lastrow; ++index) {
|
||||||
if(filter_->read(index) == fp) {
|
if(filter_->read(index) == fp) {
|
||||||
filter_->write(index, 0);
|
filter_->write(index, 0);
|
||||||
values_->write(index, 0);
|
values_->write(index, 0);
|
||||||
--idxTracker_[bucket]; // track bucket size reduction
|
--idxTracker_[bucket]; // track bucket size reduction
|
||||||
@ -213,7 +217,8 @@ void PerfectHash<T>::remove(const wordID_t* IDs, const int len) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
template<typename T> // clear filter index
|
template<typename T> // clear filter index
|
||||||
void PerfectHash<T>::remove(uint64_t index) {
|
void PerfectHash<T>::remove(uint64_t index)
|
||||||
|
{
|
||||||
CHECK(index < cells_);
|
CHECK(index < cells_);
|
||||||
CHECK(filter_->read(index) != 0); // slow
|
CHECK(filter_->read(index) != 0); // slow
|
||||||
filter_->write(index, 0);
|
filter_->write(index, 0);
|
||||||
@ -224,19 +229,21 @@ void PerfectHash<T>::remove(uint64_t index) {
|
|||||||
}
|
}
|
||||||
template<typename T>
|
template<typename T>
|
||||||
T PerfectHash<T>::nonZeroSignature(const wordID_t* IDs, const int len,
|
T PerfectHash<T>::nonZeroSignature(const wordID_t* IDs, const int len,
|
||||||
count_t bucket) {
|
count_t bucket)
|
||||||
|
{
|
||||||
count_t h = bucket;
|
count_t h = bucket;
|
||||||
T fingerprint(0);
|
T fingerprint(0);
|
||||||
do {
|
do {
|
||||||
fingerprint = fingerHash_->hash(IDs, len, h);
|
fingerprint = fingerHash_->hash(IDs, len, h);
|
||||||
h += (h < fingerHash_->size() - 1 ? 1 : -h); // wrap around
|
h += (h < fingerHash_->size() - 1 ? 1 : -h); // wrap around
|
||||||
} while((fingerprint == 0) && (h != bucket));
|
} while((fingerprint == 0) && (h != bucket));
|
||||||
if(fingerprint == 0)
|
if(fingerprint == 0)
|
||||||
cerr << "WARNING: Unable to find non-zero signature for ngram\n" << endl;
|
cerr << "WARNING: Unable to find non-zero signature for ngram\n" << endl;
|
||||||
return fingerprint;
|
return fingerprint;
|
||||||
}
|
}
|
||||||
template<typename T>
|
template<typename T>
|
||||||
string PerfectHash<T>::hpDictKeyValue(const wordID_t* IDs, const int len) {
|
string PerfectHash<T>::hpDictKeyValue(const wordID_t* IDs, const int len)
|
||||||
|
{
|
||||||
string skey(" ");
|
string skey(" ");
|
||||||
for(int i = 0; i < len; ++i)
|
for(int i = 0; i < len; ++i)
|
||||||
skey += Utils::IntToStr(IDs[i]) + "¬";
|
skey += Utils::IntToStr(IDs[i]) + "¬";
|
||||||
@ -244,17 +251,20 @@ string PerfectHash<T>::hpDictKeyValue(const wordID_t* IDs, const int len) {
|
|||||||
return skey;
|
return skey;
|
||||||
}
|
}
|
||||||
template<typename T>
|
template<typename T>
|
||||||
count_t PerfectHash<T>::hpDictMemUse() {
|
count_t PerfectHash<T>::hpDictMemUse()
|
||||||
|
{
|
||||||
// return hpDict memory usage in MBs
|
// return hpDict memory usage in MBs
|
||||||
return (count_t) sizeof(hpDict_t::value_type)* dict_.size() >> 20;
|
return (count_t) sizeof(hpDict_t::value_type)* dict_.size() >> 20;
|
||||||
}
|
}
|
||||||
template<typename T>
|
template<typename T>
|
||||||
count_t PerfectHash<T>::bucketsMemUse() {
|
count_t PerfectHash<T>::bucketsMemUse()
|
||||||
|
{
|
||||||
// return bucket memory usage in MBs
|
// return bucket memory usage in MBs
|
||||||
return (count_t) (filter_->size() + values_->size());
|
return (count_t) (filter_->size() + values_->size());
|
||||||
}
|
}
|
||||||
template<typename T>
|
template<typename T>
|
||||||
void PerfectHash<T>::save(FileHandler* fout) {
|
void PerfectHash<T>::save(FileHandler* fout)
|
||||||
|
{
|
||||||
CHECK(fout != 0);
|
CHECK(fout != 0);
|
||||||
cerr << "\tSaving perfect hash parameters...\n";
|
cerr << "\tSaving perfect hash parameters...\n";
|
||||||
fout->write((char*)&hitMask_, sizeof(hitMask_));
|
fout->write((char*)&hitMask_, sizeof(hitMask_));
|
||||||
@ -275,11 +285,12 @@ void PerfectHash<T>::save(FileHandler* fout) {
|
|||||||
count_t size = dict_.size();
|
count_t size = dict_.size();
|
||||||
fout->write((char*)&size, sizeof(count_t));
|
fout->write((char*)&size, sizeof(count_t));
|
||||||
*fout << endl;
|
*fout << endl;
|
||||||
iterate(dict_, t)
|
iterate(dict_, t)
|
||||||
*fout << t->first << "\t" << t->second << "\n";
|
*fout << t->first << "\t" << t->second << "\n";
|
||||||
}
|
}
|
||||||
template<typename T>
|
template<typename T>
|
||||||
void PerfectHash<T>::load(FileHandler* fin) {
|
void PerfectHash<T>::load(FileHandler* fin)
|
||||||
|
{
|
||||||
CHECK(fin != 0);
|
CHECK(fin != 0);
|
||||||
cerr << "\tLoading perfect hash parameters...\n";
|
cerr << "\tLoading perfect hash parameters...\n";
|
||||||
fin->read((char*)&hitMask_, sizeof(hitMask_));
|
fin->read((char*)&hitMask_, sizeof(hitMask_));
|
||||||
@ -315,12 +326,13 @@ void PerfectHash<T>::load(FileHandler* fin) {
|
|||||||
cerr << "Finished loading ORLM." << endl;
|
cerr << "Finished loading ORLM." << endl;
|
||||||
}
|
}
|
||||||
template<typename T>
|
template<typename T>
|
||||||
void PerfectHash<T>::analyze() {
|
void PerfectHash<T>::analyze()
|
||||||
|
{
|
||||||
cerr << "Analyzing Dynamic Bloomier Filter...\n";
|
cerr << "Analyzing Dynamic Bloomier Filter...\n";
|
||||||
// see how many items in each bucket
|
// see how many items in each bucket
|
||||||
uint8_t* bucketCnt = new uint8_t[totBuckets_];
|
uint8_t* bucketCnt = new uint8_t[totBuckets_];
|
||||||
unsigned largestBucket = 0, totalCellsSet = 0,
|
unsigned largestBucket = 0, totalCellsSet = 0,
|
||||||
smallestBucket = bucketRange_, totalZeroes = 0;
|
smallestBucket = bucketRange_, totalZeroes = 0;
|
||||||
int curBucket = -1, fullBuckets(0);
|
int curBucket = -1, fullBuckets(0);
|
||||||
for(int i = 0; i < totBuckets_; ++i) bucketCnt[i] = 0;
|
for(int i = 0; i < totBuckets_; ++i) bucketCnt[i] = 0;
|
||||||
for(uint64_t i =0; i < cells_; ++i) {
|
for(uint64_t i =0; i < cells_; ++i) {
|
||||||
@ -328,16 +340,14 @@ void PerfectHash<T>::analyze() {
|
|||||||
if(filter_->read(i) != 0) {
|
if(filter_->read(i) != 0) {
|
||||||
++bucketCnt[curBucket];
|
++bucketCnt[curBucket];
|
||||||
++totalCellsSet;
|
++totalCellsSet;
|
||||||
}
|
} else ++totalZeroes;
|
||||||
else ++totalZeroes;
|
|
||||||
}
|
}
|
||||||
count_t bi = 0, si = 0;
|
count_t bi = 0, si = 0;
|
||||||
for(int i = 0; i < totBuckets_; ++i) {
|
for(int i = 0; i < totBuckets_; ++i) {
|
||||||
if(bucketCnt[i] > largestBucket) {
|
if(bucketCnt[i] > largestBucket) {
|
||||||
largestBucket = bucketCnt[i];
|
largestBucket = bucketCnt[i];
|
||||||
bi = i;
|
bi = i;
|
||||||
}
|
} else if(bucketCnt[i] < smallestBucket) {
|
||||||
else if(bucketCnt[i] < smallestBucket) {
|
|
||||||
smallestBucket = bucketCnt[i];
|
smallestBucket = bucketCnt[i];
|
||||||
si = i;
|
si = i;
|
||||||
}
|
}
|
||||||
@ -350,8 +360,8 @@ void PerfectHash<T>::analyze() {
|
|||||||
}
|
}
|
||||||
for(int i = 0; i < totBuckets_; ++i) {
|
for(int i = 0; i < totBuckets_; ++i) {
|
||||||
if(bucketCnt[i] != idxTracker_[i])
|
if(bucketCnt[i] != idxTracker_[i])
|
||||||
cerr << "bucketCnt[" << i << "] = " << (int)bucketCnt[i] <<
|
cerr << "bucketCnt[" << i << "] = " << (int)bucketCnt[i] <<
|
||||||
"\tidxTracker_[" << i << "] = " << (int)idxTracker_[i] << endl;
|
"\tidxTracker_[" << i << "] = " << (int)idxTracker_[i] << endl;
|
||||||
}
|
}
|
||||||
cerr << "total cells= " << cells_ << endl;
|
cerr << "total cells= " << cells_ << endl;
|
||||||
cerr << "total buckets= " << totBuckets_ << endl;
|
cerr << "total buckets= " << totBuckets_ << endl;
|
||||||
@ -364,7 +374,7 @@ void PerfectHash<T>::analyze() {
|
|||||||
cerr << "largest bucket (" << bi << ") size= " << largestBucket << endl;
|
cerr << "largest bucket (" << bi << ") size= " << largestBucket << endl;
|
||||||
cerr << "smallest bucket (" << si << ") size= " << smallestBucket << endl;
|
cerr << "smallest bucket (" << si << ") size= " << smallestBucket << endl;
|
||||||
cerr << "last bucket size= " << (int)bucketCnt[totBuckets_ - 1] <<
|
cerr << "last bucket size= " << (int)bucketCnt[totBuckets_ - 1] <<
|
||||||
" (idxTracker last bucket size = " << (int)idxTracker_[totBuckets_ - 1] << ")" << endl;
|
" (idxTracker last bucket size = " << (int)idxTracker_[totBuckets_ - 1] << ")" << endl;
|
||||||
cerr << "total buckets full = " << fullBuckets << endl;
|
cerr << "total buckets full = " << fullBuckets << endl;
|
||||||
cerr << "total collision errors= " << collisions_ << endl;
|
cerr << "total collision errors= " << collisions_ << endl;
|
||||||
cerr << "high performance dictionary size= " << dict_.size() << endl;
|
cerr << "high performance dictionary size= " << dict_.size() << endl;
|
||||||
@ -373,14 +383,15 @@ void PerfectHash<T>::analyze() {
|
|||||||
cerr << "values MBs= " << values_->size() << endl;
|
cerr << "values MBs= " << values_->size() << endl;
|
||||||
delete[] bucketCnt;
|
delete[] bucketCnt;
|
||||||
}
|
}
|
||||||
template<typename T>
|
template<typename T>
|
||||||
bool PerfectHash<T>::update2(const wordID_t* IDs, const int len,
|
bool PerfectHash<T>::update2(const wordID_t* IDs, const int len,
|
||||||
const count_t value, hpdEntry_t& hpdAddr, uint64_t& filterIdx) {
|
const count_t value, hpdEntry_t& hpdAddr, uint64_t& filterIdx)
|
||||||
|
{
|
||||||
// check if key is in high perf. dictionary
|
// check if key is in high perf. dictionary
|
||||||
filterIdx = cells_ + 1;
|
filterIdx = cells_ + 1;
|
||||||
string skey = hpDictKeyValue(IDs, len);
|
string skey = hpDictKeyValue(IDs, len);
|
||||||
if((hpdAddr = dict_.find(skey)) != dict_.end()) {
|
if((hpdAddr = dict_.find(skey)) != dict_.end()) {
|
||||||
hpdAddr->second += value;
|
hpdAddr->second += value;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
// else hash ngram
|
// else hash ngram
|
||||||
@ -389,18 +400,18 @@ bool PerfectHash<T>::update2(const wordID_t* IDs, const int len,
|
|||||||
// restriction on fprint value is non-zero
|
// restriction on fprint value is non-zero
|
||||||
T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
|
T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
|
||||||
uint64_t index = bucket * bucketRange_, // starting bucket row
|
uint64_t index = bucket * bucketRange_, // starting bucket row
|
||||||
lastrow = index + bucketRange_;
|
lastrow = index + bucketRange_;
|
||||||
while(index < lastrow) { // must check each row for matching fp event
|
while(index < lastrow) { // must check each row for matching fp event
|
||||||
T filterVal = filter_->read(index);
|
T filterVal = filter_->read(index);
|
||||||
if(filterVal == fp) { // found event w.h.p.
|
if(filterVal == fp) { // found event w.h.p.
|
||||||
int oldval = (int)qtizer_->value(values_->read(index));
|
int oldval = (int)qtizer_->value(values_->read(index));
|
||||||
values_->write(index, (T)qtizer_->code(oldval + value));
|
values_->write(index, (T)qtizer_->code(oldval + value));
|
||||||
filterIdx = index;
|
filterIdx = index;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
++index;
|
++index;
|
||||||
}
|
}
|
||||||
// add if it gets here.
|
// add if it gets here.
|
||||||
insert(IDs, len, value);
|
insert(IDs, len, value);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -8,7 +8,8 @@
|
|||||||
#include "types.h"
|
#include "types.h"
|
||||||
|
|
||||||
static const float kFloatErr = 0.00001f;
|
static const float kFloatErr = 0.00001f;
|
||||||
class LogQtizer {
|
class LogQtizer
|
||||||
|
{
|
||||||
public:
|
public:
|
||||||
LogQtizer(float i): base_(pow(2, 1 / i)) {
|
LogQtizer(float i): base_(pow(2, 1 / i)) {
|
||||||
CHECK(base_ > 1);
|
CHECK(base_ > 1);
|
||||||
@ -16,8 +17,8 @@ public:
|
|||||||
float value = 1; // code = 1 -> value = 1 for any base
|
float value = 1; // code = 1 -> value = 1 for any base
|
||||||
std::vector<float> code_to_value_vec;
|
std::vector<float> code_to_value_vec;
|
||||||
while (log2(value) < 30) { // assume 2^30 is largest count
|
while (log2(value) < 30) { // assume 2^30 is largest count
|
||||||
code_to_value_vec.push_back(value);
|
code_to_value_vec.push_back(value);
|
||||||
value = pow(base_, ++max_code_);
|
value = pow(base_, ++max_code_);
|
||||||
}
|
}
|
||||||
code_to_value_vec.push_back(value); // store max_code_ so in total [0, max_code_]
|
code_to_value_vec.push_back(value); // store max_code_ so in total [0, max_code_]
|
||||||
// get valid range
|
// get valid range
|
||||||
@ -40,22 +41,22 @@ public:
|
|||||||
int code(float value) {
|
int code(float value) {
|
||||||
// should just be: return log_b(value)
|
// should just be: return log_b(value)
|
||||||
CHECK(!(value < min_value_ || value > max_value_));
|
CHECK(!(value < min_value_ || value > max_value_));
|
||||||
// but binary search removes errors due to floor operator above
|
// but binary search removes errors due to floor operator above
|
||||||
int code = static_cast<int>(std::lower_bound(code_to_value_, code_to_value_+ max_code_,
|
int code = static_cast<int>(std::lower_bound(code_to_value_, code_to_value_+ max_code_,
|
||||||
value) - code_to_value_);
|
value) - code_to_value_);
|
||||||
// make sure not overestimating
|
// make sure not overestimating
|
||||||
code = code_to_value_[code] > value ? code - 1 : code;
|
code = code_to_value_[code] > value ? code - 1 : code;
|
||||||
return code;
|
return code;
|
||||||
}
|
}
|
||||||
inline float value(int code) {
|
inline float value(int code) {
|
||||||
// table look up for values
|
// table look up for values
|
||||||
return code_to_value_[code];
|
return code_to_value_[code];
|
||||||
}
|
}
|
||||||
inline int maxcode() {
|
inline int maxcode() {
|
||||||
return max_code_;
|
return max_code_;
|
||||||
}
|
}
|
||||||
inline float logValue(int code) {
|
inline float logValue(int code) {
|
||||||
// table look up for log of values
|
// table look up for log of values
|
||||||
return code_to_log_value_[code];
|
return code_to_log_value_[code];
|
||||||
}
|
}
|
||||||
~LogQtizer() {
|
~LogQtizer() {
|
||||||
@ -69,15 +70,15 @@ public:
|
|||||||
fout->write((char*)&min_value_, sizeof(min_value_));
|
fout->write((char*)&min_value_, sizeof(min_value_));
|
||||||
for (int j = 0; j <= max_code_; ++j)
|
for (int j = 0; j <= max_code_; ++j)
|
||||||
fout->write((char*)&code_to_value_[j], sizeof(code_to_value_[j]));
|
fout->write((char*)&code_to_value_[j], sizeof(code_to_value_[j]));
|
||||||
for (int j = 0; j <= max_code_; ++j)
|
for (int j = 0; j <= max_code_; ++j)
|
||||||
fout->write((char*)&code_to_log_value_[j], sizeof(code_to_log_value_[j]));
|
fout->write((char*)&code_to_log_value_[j], sizeof(code_to_log_value_[j]));
|
||||||
std::cerr << "Saved log codebook with " << max_code_ + 1 << " codes." <<std::endl;
|
std::cerr << "Saved log codebook with " << max_code_ + 1 << " codes." <<std::endl;
|
||||||
}
|
}
|
||||||
private:
|
private:
|
||||||
float base_;
|
float base_;
|
||||||
float* code_to_value_;
|
float* code_to_value_;
|
||||||
float* code_to_log_value_;
|
float* code_to_log_value_;
|
||||||
int max_code_;
|
int max_code_;
|
||||||
float max_value_;
|
float max_value_;
|
||||||
float min_value_;
|
float min_value_;
|
||||||
void load(FileHandler* fin) {
|
void load(FileHandler* fin) {
|
||||||
|
@ -103,10 +103,11 @@ bool Vocab::Load(const std::string & vocab_path, const FactorDirection& directio
|
|||||||
std::cerr << "Loading vocab from " << vocab_path << std::endl;
|
std::cerr << "Loading vocab from " << vocab_path << std::endl;
|
||||||
return Load(&vcbin, direction, factors, closed);
|
return Load(&vcbin, direction, factors, closed);
|
||||||
}
|
}
|
||||||
bool Vocab::Load(FileHandler* vcbin) {
|
bool Vocab::Load(FileHandler* vcbin)
|
||||||
|
{
|
||||||
FactorList factors;
|
FactorList factors;
|
||||||
factors.push_back(0);
|
factors.push_back(0);
|
||||||
return Load(vcbin, Input, factors);
|
return Load(vcbin, Input, factors);
|
||||||
}
|
}
|
||||||
bool Vocab::Load(FileHandler* vcbin, const FactorDirection& direction,
|
bool Vocab::Load(FileHandler* vcbin, const FactorDirection& direction,
|
||||||
const FactorList& factors, bool closed)
|
const FactorList& factors, bool closed)
|
||||||
|
@ -74,12 +74,12 @@ int DynSuffixArray::F_firstIdx(unsigned word)
|
|||||||
// return index of first row where word is found in m_F
|
// return index of first row where word is found in m_F
|
||||||
/*for(int i=0; i < m_F->size(); ++i) {
|
/*for(int i=0; i < m_F->size(); ++i) {
|
||||||
if(m_F->at(i) == word) {
|
if(m_F->at(i) == word) {
|
||||||
return i;
|
return i;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return -1;*/
|
return -1;*/
|
||||||
//NOTE: lower_bound is faster than linear search above but may cause issues
|
//NOTE: lower_bound is faster than linear search above but may cause issues
|
||||||
// if ordering of vocab is not consecutive (ie..after deletions)
|
// if ordering of vocab is not consecutive (ie..after deletions)
|
||||||
int low = std::lower_bound(m_F->begin(), m_F->end(), word) - m_F->begin();
|
int low = std::lower_bound(m_F->begin(), m_F->end(), word) - m_F->begin();
|
||||||
//cerr << "in F_firstIdx with word = " << word << " and low = " << low << " and F->size() =" << m_F->size() << endl;
|
//cerr << "in F_firstIdx with word = " << word << " and low = " << low << " and F->size() =" << m_F->size() << endl;
|
||||||
if(low >= m_F->size())
|
if(low >= m_F->size())
|
||||||
@ -146,8 +146,8 @@ void DynSuffixArray::Reorder(unsigned j, unsigned jprime)
|
|||||||
{
|
{
|
||||||
set<pair<unsigned, unsigned> > seen;
|
set<pair<unsigned, unsigned> > seen;
|
||||||
while(j != jprime) {
|
while(j != jprime) {
|
||||||
// this 'seenit' check added for data with many loops. will remove after double
|
// this 'seenit' check added for data with many loops. will remove after double
|
||||||
// checking.
|
// checking.
|
||||||
bool seenit = seen.insert(std::make_pair(j, jprime)).second;
|
bool seenit = seen.insert(std::make_pair(j, jprime)).second;
|
||||||
if(seenit) {
|
if(seenit) {
|
||||||
for(int i=1; i < m_SA->size(); ++i) {
|
for(int i=1; i < m_SA->size(); ++i) {
|
||||||
@ -163,9 +163,9 @@ void DynSuffixArray::Reorder(unsigned j, unsigned jprime)
|
|||||||
int new_j = LastFirstFunc(j);
|
int new_j = LastFirstFunc(j);
|
||||||
CHECK(j <= jprime);
|
CHECK(j <= jprime);
|
||||||
// for SA and L, the element at pos j is moved to pos j'
|
// for SA and L, the element at pos j is moved to pos j'
|
||||||
m_L->insert(m_L->begin() + jprime + 1, m_L->at(j));
|
m_L->insert(m_L->begin() + jprime + 1, m_L->at(j));
|
||||||
m_L->erase(m_L->begin() + j);
|
m_L->erase(m_L->begin() + j);
|
||||||
m_SA->insert(m_SA->begin() + jprime + 1, m_SA->at(j));
|
m_SA->insert(m_SA->begin() + jprime + 1, m_SA->at(j));
|
||||||
m_SA->erase(m_SA->begin() + j);
|
m_SA->erase(m_SA->begin() + j);
|
||||||
// all ISA values between (j...j'] decremented
|
// all ISA values between (j...j'] decremented
|
||||||
for(size_t i = 0; i < m_ISA->size(); ++i) {
|
for(size_t i = 0; i < m_ISA->size(); ++i) {
|
||||||
|
@ -33,9 +33,9 @@ namespace Moses
|
|||||||
class FactorFriend;
|
class FactorFriend;
|
||||||
class FactorCollection;
|
class FactorCollection;
|
||||||
|
|
||||||
/** Represents a factor (word, POS, etc).
|
/** Represents a factor (word, POS, etc).
|
||||||
*
|
*
|
||||||
* A Factor has a contiguous identifier and string value.
|
* A Factor has a contiguous identifier and string value.
|
||||||
*/
|
*/
|
||||||
class Factor
|
class Factor
|
||||||
{
|
{
|
||||||
@ -45,17 +45,17 @@ class Factor
|
|||||||
friend class FactorCollection;
|
friend class FactorCollection;
|
||||||
friend class FactorFriend;
|
friend class FactorFriend;
|
||||||
|
|
||||||
// FactorCollection writes here.
|
// FactorCollection writes here.
|
||||||
std::string m_string;
|
std::string m_string;
|
||||||
size_t m_id;
|
size_t m_id;
|
||||||
|
|
||||||
//! protected constructor. only friend class, FactorCollection, is allowed to create Factor objects
|
//! protected constructor. only friend class, FactorCollection, is allowed to create Factor objects
|
||||||
Factor() {}
|
Factor() {}
|
||||||
|
|
||||||
// Needed for STL containers. They'll delegate through FactorFriend, which is never exposed publicly.
|
// Needed for STL containers. They'll delegate through FactorFriend, which is never exposed publicly.
|
||||||
Factor(const Factor &factor) : m_string(factor.m_string), m_id(factor.m_id) {}
|
Factor(const Factor &factor) : m_string(factor.m_string), m_id(factor.m_id) {}
|
||||||
|
|
||||||
// Not implemented. Shouldn't be called.
|
// Not implemented. Shouldn't be called.
|
||||||
Factor &operator=(const Factor &factor);
|
Factor &operator=(const Factor &factor);
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
@ -33,7 +33,7 @@ FactorCollection FactorCollection::s_instance;
|
|||||||
|
|
||||||
const Factor *FactorCollection::AddFactor(const StringPiece &factorString)
|
const Factor *FactorCollection::AddFactor(const StringPiece &factorString)
|
||||||
{
|
{
|
||||||
// Sorry this is so complicated. Can't we just require everybody to use Boost >= 1.42? The issue is that I can't check BOOST_VERSION unless we have Boost.
|
// Sorry this is so complicated. Can't we just require everybody to use Boost >= 1.42? The issue is that I can't check BOOST_VERSION unless we have Boost.
|
||||||
#ifdef WITH_THREADS
|
#ifdef WITH_THREADS
|
||||||
#if BOOST_VERSION < 104200
|
#if BOOST_VERSION < 104200
|
||||||
FactorFriend to_ins;
|
FactorFriend to_ins;
|
||||||
@ -42,7 +42,7 @@ const Factor *FactorCollection::AddFactor(const StringPiece &factorString)
|
|||||||
{
|
{
|
||||||
boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
|
boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
|
||||||
#if BOOST_VERSION >= 104200
|
#if BOOST_VERSION >= 104200
|
||||||
// If this line doesn't compile, upgrade your Boost.
|
// If this line doesn't compile, upgrade your Boost.
|
||||||
Set::const_iterator i = m_set.find(factorString, HashFactor(), EqualsFactor());
|
Set::const_iterator i = m_set.find(factorString, HashFactor(), EqualsFactor());
|
||||||
#else // BOOST_VERSION
|
#else // BOOST_VERSION
|
||||||
Set::const_iterator i = m_set.find(to_ins);
|
Set::const_iterator i = m_set.find(to_ins);
|
||||||
|
@ -47,7 +47,7 @@ namespace Moses
|
|||||||
* private and friended to FactorFriend. The STL containers can delegate
|
* private and friended to FactorFriend. The STL containers can delegate
|
||||||
* copying, so friending the container isn't sufficient. STL containers see
|
* copying, so friending the container isn't sufficient. STL containers see
|
||||||
* FactorFriend's public copy constructor and everybody else sees Factor's
|
* FactorFriend's public copy constructor and everybody else sees Factor's
|
||||||
* private copy constructor.
|
* private copy constructor.
|
||||||
*/
|
*/
|
||||||
struct FactorFriend {
|
struct FactorFriend {
|
||||||
Factor in;
|
Factor in;
|
||||||
|
@ -30,20 +30,24 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
namespace Moses {
|
namespace Moses
|
||||||
|
{
|
||||||
|
|
||||||
LanguageModel::LanguageModel() {
|
LanguageModel::LanguageModel()
|
||||||
m_enableOOVFeature = StaticData::Instance().GetLMEnableOOVFeature();
|
{
|
||||||
|
m_enableOOVFeature = StaticData::Instance().GetLMEnableOOVFeature();
|
||||||
}
|
}
|
||||||
|
|
||||||
void LanguageModel::Init(ScoreIndexManager &scoreIndexManager) {
|
void LanguageModel::Init(ScoreIndexManager &scoreIndexManager)
|
||||||
|
{
|
||||||
scoreIndexManager.AddScoreProducer(this);
|
scoreIndexManager.AddScoreProducer(this);
|
||||||
}
|
}
|
||||||
|
|
||||||
LanguageModel::~LanguageModel() {}
|
LanguageModel::~LanguageModel() {}
|
||||||
|
|
||||||
// don't inline virtual funcs...
|
// don't inline virtual funcs...
|
||||||
size_t LanguageModel::GetNumScoreComponents() const {
|
size_t LanguageModel::GetNumScoreComponents() const
|
||||||
|
{
|
||||||
if (m_enableOOVFeature) {
|
if (m_enableOOVFeature) {
|
||||||
return 2;
|
return 2;
|
||||||
} else {
|
} else {
|
||||||
@ -51,13 +55,15 @@ size_t LanguageModel::GetNumScoreComponents() const {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
float LanguageModel::GetWeight() const {
|
float LanguageModel::GetWeight() const
|
||||||
|
{
|
||||||
size_t lmIndex = StaticData::Instance().GetScoreIndexManager().
|
size_t lmIndex = StaticData::Instance().GetScoreIndexManager().
|
||||||
GetBeginIndex(GetScoreBookkeepingID());
|
GetBeginIndex(GetScoreBookkeepingID());
|
||||||
return StaticData::Instance().GetAllWeights()[lmIndex];
|
return StaticData::Instance().GetAllWeights()[lmIndex];
|
||||||
}
|
}
|
||||||
|
|
||||||
float LanguageModel::GetOOVWeight() const {
|
float LanguageModel::GetOOVWeight() const
|
||||||
|
{
|
||||||
if (!m_enableOOVFeature) return 0;
|
if (!m_enableOOVFeature) return 0;
|
||||||
size_t lmIndex = StaticData::Instance().GetScoreIndexManager().
|
size_t lmIndex = StaticData::Instance().GetScoreIndexManager().
|
||||||
GetBeginIndex(GetScoreBookkeepingID());
|
GetBeginIndex(GetScoreBookkeepingID());
|
||||||
|
@ -35,7 +35,8 @@ class Phrase;
|
|||||||
class ScoreIndexManager;
|
class ScoreIndexManager;
|
||||||
|
|
||||||
//! Abstract base class which represent a language model on a contiguous phrase
|
//! Abstract base class which represent a language model on a contiguous phrase
|
||||||
class LanguageModel : public StatefulFeatureFunction {
|
class LanguageModel : public StatefulFeatureFunction
|
||||||
|
{
|
||||||
protected:
|
protected:
|
||||||
LanguageModel();
|
LanguageModel();
|
||||||
|
|
||||||
@ -43,11 +44,11 @@ protected:
|
|||||||
void Init(ScoreIndexManager &scoreIndexManager);
|
void Init(ScoreIndexManager &scoreIndexManager);
|
||||||
|
|
||||||
bool m_enableOOVFeature;
|
bool m_enableOOVFeature;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
virtual ~LanguageModel();
|
virtual ~LanguageModel();
|
||||||
|
|
||||||
// Make another feature without copying the underlying model data.
|
// Make another feature without copying the underlying model data.
|
||||||
virtual LanguageModel *Duplicate(ScoreIndexManager &scoreIndexManager) const = 0;
|
virtual LanguageModel *Duplicate(ScoreIndexManager &scoreIndexManager) const = 0;
|
||||||
|
|
||||||
//! see ScoreProducer.h
|
//! see ScoreProducer.h
|
||||||
|
@ -10,10 +10,12 @@
|
|||||||
namespace Moses
|
namespace Moses
|
||||||
{
|
{
|
||||||
|
|
||||||
LanguageModelDMapLM::LanguageModelDMapLM() : m_lm(0) {
|
LanguageModelDMapLM::LanguageModelDMapLM() : m_lm(0)
|
||||||
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
LanguageModelDMapLM::~LanguageModelDMapLM() {
|
LanguageModelDMapLM::~LanguageModelDMapLM()
|
||||||
|
{
|
||||||
delete m_lm;
|
delete m_lm;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -51,8 +53,8 @@ void LanguageModelDMapLM::CreateFactor(FactorCollection& factorCollection)
|
|||||||
}
|
}
|
||||||
|
|
||||||
LMResult LanguageModelDMapLM::GetValueGivenState(
|
LMResult LanguageModelDMapLM::GetValueGivenState(
|
||||||
const std::vector<const Word*>& contextFactor,
|
const std::vector<const Word*>& contextFactor,
|
||||||
FFState& state) const
|
FFState& state) const
|
||||||
{
|
{
|
||||||
DMapLMState& cast_state = static_cast<DMapLMState&>(state);
|
DMapLMState& cast_state = static_cast<DMapLMState&>(state);
|
||||||
LMResult result;
|
LMResult result;
|
||||||
@ -65,8 +67,8 @@ LMResult LanguageModelDMapLM::GetValueGivenState(
|
|||||||
}
|
}
|
||||||
|
|
||||||
LMResult LanguageModelDMapLM::GetValueForgotState(
|
LMResult LanguageModelDMapLM::GetValueForgotState(
|
||||||
const std::vector<const Word*>& contextFactor,
|
const std::vector<const Word*>& contextFactor,
|
||||||
FFState& outState) const
|
FFState& outState) const
|
||||||
{
|
{
|
||||||
DMapLMState& cast_state = static_cast<DMapLMState&>(outState);
|
DMapLMState& cast_state = static_cast<DMapLMState&>(outState);
|
||||||
LMResult result;
|
LMResult result;
|
||||||
@ -78,13 +80,13 @@ LMResult LanguageModelDMapLM::GetValueForgotState(
|
|||||||
}
|
}
|
||||||
|
|
||||||
float LanguageModelDMapLM::GetValue(
|
float LanguageModelDMapLM::GetValue(
|
||||||
const std::vector<const Word*>& contextFactor,
|
const std::vector<const Word*>& contextFactor,
|
||||||
size_t target_order,
|
size_t target_order,
|
||||||
size_t* succeeding_order) const
|
size_t* succeeding_order) const
|
||||||
{
|
{
|
||||||
FactorType factorType = GetFactorType();
|
FactorType factorType = GetFactorType();
|
||||||
float score;
|
float score;
|
||||||
|
|
||||||
std::string ngram_string("");
|
std::string ngram_string("");
|
||||||
ngram_string.append(((*contextFactor[0])[factorType])->GetString());
|
ngram_string.append(((*contextFactor[0])[factorType])->GetString());
|
||||||
for (size_t i = 1; i < contextFactor.size(); ++i) {
|
for (size_t i = 1; i < contextFactor.size(); ++i) {
|
||||||
@ -97,38 +99,44 @@ float LanguageModelDMapLM::GetValue(
|
|||||||
return score;
|
return score;
|
||||||
}
|
}
|
||||||
|
|
||||||
const FFState* LanguageModelDMapLM::GetNullContextState() const {
|
const FFState* LanguageModelDMapLM::GetNullContextState() const
|
||||||
DMapLMState* state = new DMapLMState();
|
{
|
||||||
state->m_last_succeeding_order = GetNGramOrder();
|
DMapLMState* state = new DMapLMState();
|
||||||
return state;
|
state->m_last_succeeding_order = GetNGramOrder();
|
||||||
|
return state;
|
||||||
}
|
}
|
||||||
|
|
||||||
FFState* LanguageModelDMapLM::GetNewSentenceState() const {
|
FFState* LanguageModelDMapLM::GetNewSentenceState() const
|
||||||
DMapLMState* state = new DMapLMState();
|
{
|
||||||
state->m_last_succeeding_order = GetNGramOrder();
|
DMapLMState* state = new DMapLMState();
|
||||||
return state;
|
state->m_last_succeeding_order = GetNGramOrder();
|
||||||
|
return state;
|
||||||
}
|
}
|
||||||
|
|
||||||
const FFState* LanguageModelDMapLM::GetBeginSentenceState() const {
|
const FFState* LanguageModelDMapLM::GetBeginSentenceState() const
|
||||||
DMapLMState* state = new DMapLMState();
|
{
|
||||||
state->m_last_succeeding_order = GetNGramOrder();
|
DMapLMState* state = new DMapLMState();
|
||||||
return state;
|
state->m_last_succeeding_order = GetNGramOrder();
|
||||||
|
return state;
|
||||||
}
|
}
|
||||||
|
|
||||||
FFState* LanguageModelDMapLM::NewState(const FFState* state) const {
|
FFState* LanguageModelDMapLM::NewState(const FFState* state) const
|
||||||
DMapLMState* new_state = new DMapLMState();
|
{
|
||||||
const DMapLMState* cast_state = static_cast<const DMapLMState*>(state);
|
DMapLMState* new_state = new DMapLMState();
|
||||||
new_state->m_last_succeeding_order = cast_state->m_last_succeeding_order;
|
const DMapLMState* cast_state = static_cast<const DMapLMState*>(state);
|
||||||
return new_state;
|
new_state->m_last_succeeding_order = cast_state->m_last_succeeding_order;
|
||||||
|
return new_state;
|
||||||
}
|
}
|
||||||
|
|
||||||
void LanguageModelDMapLM::CleanUpAfterSentenceProcessing() {
|
void LanguageModelDMapLM::CleanUpAfterSentenceProcessing()
|
||||||
|
{
|
||||||
m_lm->printStats();
|
m_lm->printStats();
|
||||||
m_lm->resetStats();
|
m_lm->resetStats();
|
||||||
m_lm->clearCaches();
|
m_lm->clearCaches();
|
||||||
}
|
}
|
||||||
|
|
||||||
void LanguageModelDMapLM::InitializeBeforeSentenceProcessing() {
|
void LanguageModelDMapLM::InitializeBeforeSentenceProcessing()
|
||||||
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace Moses
|
} // namespace Moses
|
||||||
|
@ -12,20 +12,22 @@
|
|||||||
#include "LM/SingleFactor.h"
|
#include "LM/SingleFactor.h"
|
||||||
#include "Util.h"
|
#include "Util.h"
|
||||||
|
|
||||||
namespace Moses {
|
namespace Moses
|
||||||
|
{
|
||||||
|
|
||||||
class DMapLMState : public FFState {
|
class DMapLMState : public FFState
|
||||||
|
{
|
||||||
public:
|
public:
|
||||||
int Compare(const FFState &o) const {
|
int Compare(const FFState &o) const {
|
||||||
const DMapLMState& cast_other = static_cast<const DMapLMState&>(o);
|
const DMapLMState& cast_other = static_cast<const DMapLMState&>(o);
|
||||||
if (cast_other.m_last_succeeding_order < m_last_succeeding_order)
|
if (cast_other.m_last_succeeding_order < m_last_succeeding_order)
|
||||||
return -1;
|
return -1;
|
||||||
else if (cast_other.m_last_succeeding_order > m_last_succeeding_order)
|
else if (cast_other.m_last_succeeding_order > m_last_succeeding_order)
|
||||||
return 1;
|
return 1;
|
||||||
else
|
else
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
uint8_t m_last_succeeding_order;
|
uint8_t m_last_succeeding_order;
|
||||||
};
|
};
|
||||||
|
|
||||||
class LanguageModelDMapLM : public LanguageModelSingleFactor
|
class LanguageModelDMapLM : public LanguageModelSingleFactor
|
||||||
|
@ -69,7 +69,7 @@ bool LanguageModelIRST::Load(const std::string &filePath,
|
|||||||
m_filePath = filePath;
|
m_filePath = filePath;
|
||||||
|
|
||||||
|
|
||||||
m_lmtb = m_lmtb->CreateLanguageModel(m_filePath);
|
m_lmtb = m_lmtb->CreateLanguageModel(m_filePath);
|
||||||
m_lmtb->setMaxLoadedLevel(1000);
|
m_lmtb->setMaxLoadedLevel(1000);
|
||||||
m_lmtb->load(m_filePath);
|
m_lmtb->load(m_filePath);
|
||||||
d=m_lmtb->getDict();
|
d=m_lmtb->getDict();
|
||||||
@ -140,7 +140,7 @@ int LanguageModelIRST::GetLmID( const std::string &str ) const
|
|||||||
}
|
}
|
||||||
|
|
||||||
int LanguageModelIRST::GetLmID( const Factor *factor ) const
|
int LanguageModelIRST::GetLmID( const Factor *factor ) const
|
||||||
{
|
{
|
||||||
size_t factorId = factor->GetId();
|
size_t factorId = factor->GetId();
|
||||||
|
|
||||||
if ((factorId >= m_lmIdLookup.size()) || (m_lmIdLookup[factorId] == m_empty)) {
|
if ((factorId >= m_lmIdLookup.size()) || (m_lmIdLookup[factorId] == m_empty)) {
|
||||||
@ -150,12 +150,12 @@ int LanguageModelIRST::GetLmID( const Factor *factor ) const
|
|||||||
|
|
||||||
//////////
|
//////////
|
||||||
///poiche' non c'e' distinzione tra i factorIDs delle parole sorgenti
|
///poiche' non c'e' distinzione tra i factorIDs delle parole sorgenti
|
||||||
///e delle parole target in Moses, puo' accadere che una parola target
|
///e delle parole target in Moses, puo' accadere che una parola target
|
||||||
///di cui non sia stato ancora calcolato il suo codice target abbia
|
///di cui non sia stato ancora calcolato il suo codice target abbia
|
||||||
///comunque un factorID noto (e quindi minore di m_lmIdLookup.size())
|
///comunque un factorID noto (e quindi minore di m_lmIdLookup.size())
|
||||||
///E' necessario dunque identificare questi casi di indeterminatezza
|
///E' necessario dunque identificare questi casi di indeterminatezza
|
||||||
///del codice target. Attualamente, questo controllo e' stato implementato
|
///del codice target. Attualamente, questo controllo e' stato implementato
|
||||||
///impostando a m_empty tutti i termini che non hanno ancora
|
///impostando a m_empty tutti i termini che non hanno ancora
|
||||||
//ricevuto un codice target effettivo
|
//ricevuto un codice target effettivo
|
||||||
///////////
|
///////////
|
||||||
|
|
||||||
@ -167,7 +167,7 @@ int LanguageModelIRST::GetLmID( const Factor *factor ) const
|
|||||||
/// IN POSIZIONE (factorID-1) invece che in posizione factrID dove dopo andiamo a leggerlo (vedi caso C
|
/// IN POSIZIONE (factorID-1) invece che in posizione factrID dove dopo andiamo a leggerlo (vedi caso C
|
||||||
/// Cosi' funziona ....
|
/// Cosi' funziona ....
|
||||||
/// ho un dubbio su cosa c'e' nelle prime posizioni di m_lmIdLookup
|
/// ho un dubbio su cosa c'e' nelle prime posizioni di m_lmIdLookup
|
||||||
/// quindi
|
/// quindi
|
||||||
/// e scopro che rimane vuota una entry ogni due
|
/// e scopro che rimane vuota una entry ogni due
|
||||||
/// perche' factorID cresce di due in due (perche' codifica sia source che target) "vuota" la posizione (factorID-1)
|
/// perche' factorID cresce di due in due (perche' codifica sia source che target) "vuota" la posizione (factorID-1)
|
||||||
/// non da problemi di correttezza, ma solo di "spreco" di memoria
|
/// non da problemi di correttezza, ma solo di "spreco" di memoria
|
||||||
@ -177,10 +177,10 @@ int LanguageModelIRST::GetLmID( const Factor *factor ) const
|
|||||||
////////////////
|
////////////////
|
||||||
|
|
||||||
|
|
||||||
if (factorId >= m_lmIdLookup.size()){
|
if (factorId >= m_lmIdLookup.size()) {
|
||||||
//resize and fill with m_empty
|
//resize and fill with m_empty
|
||||||
//increment the array more than needed to avoid too many resizing operation.
|
//increment the array more than needed to avoid too many resizing operation.
|
||||||
m_lmIdLookup.resize(factorId+10, m_empty);
|
m_lmIdLookup.resize(factorId+10, m_empty);
|
||||||
}
|
}
|
||||||
|
|
||||||
//insert new code
|
//insert new code
|
||||||
|
@ -68,8 +68,9 @@ void LanguageModelImplementation::GetState(
|
|||||||
GetValueForgotState(contextFactor, state);
|
GetValueForgotState(contextFactor, state);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Calculate score of a phrase.
|
// Calculate score of a phrase.
|
||||||
void LanguageModelImplementation::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const {
|
void LanguageModelImplementation::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const
|
||||||
|
{
|
||||||
fullScore = 0;
|
fullScore = 0;
|
||||||
ngramScore = 0;
|
ngramScore = 0;
|
||||||
|
|
||||||
@ -81,7 +82,7 @@ void LanguageModelImplementation::CalcScore(const Phrase &phrase, float &fullSco
|
|||||||
vector<const Word*> contextFactor;
|
vector<const Word*> contextFactor;
|
||||||
contextFactor.reserve(GetNGramOrder());
|
contextFactor.reserve(GetNGramOrder());
|
||||||
std::auto_ptr<FFState> state(NewState((phrase.GetWord(0) == GetSentenceStartArray()) ?
|
std::auto_ptr<FFState> state(NewState((phrase.GetWord(0) == GetSentenceStartArray()) ?
|
||||||
GetBeginSentenceState() : GetNullContextState()));
|
GetBeginSentenceState() : GetNullContextState()));
|
||||||
size_t currPos = 0;
|
size_t currPos = 0;
|
||||||
while (currPos < phraseSize) {
|
while (currPos < phraseSize) {
|
||||||
const Word &word = phrase.GetWord(currPos);
|
const Word &word = phrase.GetWord(currPos);
|
||||||
@ -108,7 +109,7 @@ void LanguageModelImplementation::CalcScore(const Phrase &phrase, float &fullSco
|
|||||||
fullScore += result.score;
|
fullScore += result.score;
|
||||||
if (contextFactor.size() == GetNGramOrder())
|
if (contextFactor.size() == GetNGramOrder())
|
||||||
ngramScore += result.score;
|
ngramScore += result.score;
|
||||||
if (result.unknown) ++oovCount;
|
if (result.unknown) ++oovCount;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -116,7 +117,8 @@ void LanguageModelImplementation::CalcScore(const Phrase &phrase, float &fullSco
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
FFState *LanguageModelImplementation::Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out, const LanguageModel *feature) const {
|
FFState *LanguageModelImplementation::Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out, const LanguageModel *feature) const
|
||||||
|
{
|
||||||
// In this function, we only compute the LM scores of n-grams that overlap a
|
// In this function, we only compute the LM scores of n-grams that overlap a
|
||||||
// phrase boundary. Phrase-internal scores are taken directly from the
|
// phrase boundary. Phrase-internal scores are taken directly from the
|
||||||
// translation option.
|
// translation option.
|
||||||
@ -178,9 +180,7 @@ FFState *LanguageModelImplementation::Evaluate(const Hypothesis &hypo, const FFS
|
|||||||
contextFactor[i] = &hypo.GetWord((size_t)currPos);
|
contextFactor[i] = &hypo.GetWord((size_t)currPos);
|
||||||
}
|
}
|
||||||
lmScore += GetValueForgotState(contextFactor, *res).score;
|
lmScore += GetValueForgotState(contextFactor, *res).score;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
if (endPos < currEndPos) {
|
if (endPos < currEndPos) {
|
||||||
//need to get the LM state (otherwise the last LM state is fine)
|
//need to get the LM state (otherwise the last LM state is fine)
|
||||||
for (size_t currPos = endPos+1; currPos <= currEndPos; currPos++) {
|
for (size_t currPos = endPos+1; currPos <= currEndPos; currPos++) {
|
||||||
@ -207,10 +207,11 @@ FFState *LanguageModelImplementation::Evaluate(const Hypothesis &hypo, const FFS
|
|||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
namespace {
|
namespace
|
||||||
|
{
|
||||||
|
|
||||||
// This is the FFState used by LanguageModelImplementation::EvaluateChart.
|
// This is the FFState used by LanguageModelImplementation::EvaluateChart.
|
||||||
// Though svn blame goes back to heafield, don't blame me. I just moved this from LanguageModelChartState.cpp and ChartHypothesis.cpp.
|
// Though svn blame goes back to heafield, don't blame me. I just moved this from LanguageModelChartState.cpp and ChartHypothesis.cpp.
|
||||||
class LanguageModelChartState : public FFState
|
class LanguageModelChartState : public FFState
|
||||||
{
|
{
|
||||||
private:
|
private:
|
||||||
@ -223,12 +224,11 @@ private:
|
|||||||
|
|
||||||
const ChartHypothesis &m_hypo;
|
const ChartHypothesis &m_hypo;
|
||||||
|
|
||||||
/** Construct the prefix string of up to specified size
|
/** Construct the prefix string of up to specified size
|
||||||
* \param ret prefix string
|
* \param ret prefix string
|
||||||
* \param size maximum size (typically max lm context window)
|
* \param size maximum size (typically max lm context window)
|
||||||
*/
|
*/
|
||||||
size_t CalcPrefix(const ChartHypothesis &hypo, int featureID, Phrase &ret, size_t size) const
|
size_t CalcPrefix(const ChartHypothesis &hypo, int featureID, Phrase &ret, size_t size) const {
|
||||||
{
|
|
||||||
const TargetPhrase &target = hypo.GetCurrTargetPhrase();
|
const TargetPhrase &target = hypo.GetCurrTargetPhrase();
|
||||||
const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
|
const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
|
||||||
target.GetAlignmentInfo().GetNonTermIndexMap();
|
target.GetAlignmentInfo().GetNonTermIndexMap();
|
||||||
@ -257,13 +257,12 @@ private:
|
|||||||
return size;
|
return size;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Construct the suffix phrase of up to specified size
|
/** Construct the suffix phrase of up to specified size
|
||||||
* will always be called after the construction of prefix phrase
|
* will always be called after the construction of prefix phrase
|
||||||
* \param ret suffix phrase
|
* \param ret suffix phrase
|
||||||
* \param size maximum size of suffix
|
* \param size maximum size of suffix
|
||||||
*/
|
*/
|
||||||
size_t CalcSuffix(const ChartHypothesis &hypo, int featureID, Phrase &ret, size_t size) const
|
size_t CalcSuffix(const ChartHypothesis &hypo, int featureID, Phrase &ret, size_t size) const {
|
||||||
{
|
|
||||||
CHECK(m_contextPrefix.GetSize() <= m_numTargetTerminals);
|
CHECK(m_contextPrefix.GetSize() <= m_numTargetTerminals);
|
||||||
|
|
||||||
// special handling for small hypotheses
|
// special handling for small hypotheses
|
||||||
@ -292,8 +291,7 @@ private:
|
|||||||
size_t nonTermInd = nonTermIndexMap[pos];
|
size_t nonTermInd = nonTermIndexMap[pos];
|
||||||
const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermInd);
|
const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermInd);
|
||||||
size = static_cast<const LanguageModelChartState*>(prevHypo->GetFFState(featureID))->CalcSuffix(*prevHypo, featureID, ret, size);
|
size = static_cast<const LanguageModelChartState*>(prevHypo->GetFFState(featureID))->CalcSuffix(*prevHypo, featureID, ret, size);
|
||||||
}
|
} else {
|
||||||
else {
|
|
||||||
ret.PrependWord(hypo.GetCurrTargetPhrase().GetWord(pos));
|
ret.PrependWord(hypo.GetCurrTargetPhrase().GetWord(pos));
|
||||||
size--;
|
size--;
|
||||||
}
|
}
|
||||||
@ -309,11 +307,10 @@ private:
|
|||||||
|
|
||||||
public:
|
public:
|
||||||
LanguageModelChartState(const ChartHypothesis &hypo, int featureID, size_t order)
|
LanguageModelChartState(const ChartHypothesis &hypo, int featureID, size_t order)
|
||||||
:m_lmRightContext(NULL)
|
:m_lmRightContext(NULL)
|
||||||
,m_contextPrefix(order - 1)
|
,m_contextPrefix(order - 1)
|
||||||
,m_contextSuffix( order - 1)
|
,m_contextSuffix( order - 1)
|
||||||
,m_hypo(hypo)
|
,m_hypo(hypo) {
|
||||||
{
|
|
||||||
m_numTargetTerminals = hypo.GetCurrTargetPhrase().GetNumTerminals();
|
m_numTargetTerminals = hypo.GetCurrTargetPhrase().GetNumTerminals();
|
||||||
|
|
||||||
for (std::vector<const ChartHypothesis*>::const_iterator i = hypo.GetPrevHypos().begin(); i != hypo.GetPrevHypos().end(); ++i) {
|
for (std::vector<const ChartHypothesis*>::const_iterator i = hypo.GetPrevHypos().begin(); i != hypo.GetPrevHypos().end(); ++i) {
|
||||||
@ -334,8 +331,12 @@ public:
|
|||||||
m_lmRightContext = rightState;
|
m_lmRightContext = rightState;
|
||||||
}
|
}
|
||||||
|
|
||||||
float GetPrefixScore() const { return m_prefixScore; }
|
float GetPrefixScore() const {
|
||||||
FFState* GetRightContext() const { return m_lmRightContext; }
|
return m_prefixScore;
|
||||||
|
}
|
||||||
|
FFState* GetRightContext() const {
|
||||||
|
return m_lmRightContext;
|
||||||
|
}
|
||||||
|
|
||||||
size_t GetNumTargetTerminals() const {
|
size_t GetNumTargetTerminals() const {
|
||||||
return m_numTargetTerminals;
|
return m_numTargetTerminals;
|
||||||
@ -353,8 +354,7 @@ public:
|
|||||||
dynamic_cast<const LanguageModelChartState &>( o );
|
dynamic_cast<const LanguageModelChartState &>( o );
|
||||||
|
|
||||||
// prefix
|
// prefix
|
||||||
if (m_hypo.GetCurrSourceRange().GetStartPos() > 0) // not for "<s> ..."
|
if (m_hypo.GetCurrSourceRange().GetStartPos() > 0) { // not for "<s> ..."
|
||||||
{
|
|
||||||
int ret = GetPrefix().Compare(other.GetPrefix());
|
int ret = GetPrefix().Compare(other.GetPrefix());
|
||||||
if (ret != 0)
|
if (ret != 0)
|
||||||
return ret;
|
return ret;
|
||||||
@ -362,8 +362,7 @@ public:
|
|||||||
|
|
||||||
// suffix
|
// suffix
|
||||||
size_t inputSize = m_hypo.GetManager().GetSource().GetSize();
|
size_t inputSize = m_hypo.GetManager().GetSource().GetSize();
|
||||||
if (m_hypo.GetCurrSourceRange().GetEndPos() < inputSize - 1)// not for "... </s>"
|
if (m_hypo.GetCurrSourceRange().GetEndPos() < inputSize - 1) { // not for "... </s>"
|
||||||
{
|
|
||||||
int ret = other.GetRightContext()->Compare(*m_lmRightContext);
|
int ret = other.GetRightContext()->Compare(*m_lmRightContext);
|
||||||
if (ret != 0)
|
if (ret != 0)
|
||||||
return ret;
|
return ret;
|
||||||
@ -374,7 +373,8 @@ public:
|
|||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
FFState* LanguageModelImplementation::EvaluateChart(const ChartHypothesis& hypo, int featureID, ScoreComponentCollection* out, const LanguageModel *scorer) const {
|
FFState* LanguageModelImplementation::EvaluateChart(const ChartHypothesis& hypo, int featureID, ScoreComponentCollection* out, const LanguageModel *scorer) const
|
||||||
|
{
|
||||||
LanguageModelChartState *ret = new LanguageModelChartState(hypo, featureID, GetNGramOrder());
|
LanguageModelChartState *ret = new LanguageModelChartState(hypo, featureID, GetNGramOrder());
|
||||||
// data structure for factored context phrase (history and predicted word)
|
// data structure for factored context phrase (history and predicted word)
|
||||||
vector<const Word*> contextFactor;
|
vector<const Word*> contextFactor;
|
||||||
@ -394,33 +394,28 @@ FFState* LanguageModelImplementation::EvaluateChart(const ChartHypothesis& hypo,
|
|||||||
// loop over rule
|
// loop over rule
|
||||||
for (size_t phrasePos = 0, wordPos = 0;
|
for (size_t phrasePos = 0, wordPos = 0;
|
||||||
phrasePos < hypo.GetCurrTargetPhrase().GetSize();
|
phrasePos < hypo.GetCurrTargetPhrase().GetSize();
|
||||||
phrasePos++)
|
phrasePos++) {
|
||||||
{
|
|
||||||
// consult rule for either word or non-terminal
|
// consult rule for either word or non-terminal
|
||||||
const Word &word = hypo.GetCurrTargetPhrase().GetWord(phrasePos);
|
const Word &word = hypo.GetCurrTargetPhrase().GetWord(phrasePos);
|
||||||
|
|
||||||
// regular word
|
// regular word
|
||||||
if (!word.IsNonTerminal())
|
if (!word.IsNonTerminal()) {
|
||||||
{
|
|
||||||
ShiftOrPush(contextFactor, word);
|
ShiftOrPush(contextFactor, word);
|
||||||
|
|
||||||
// beginning of sentence symbol <s>? -> just update state
|
// beginning of sentence symbol <s>? -> just update state
|
||||||
if (word == GetSentenceStartArray())
|
if (word == GetSentenceStartArray()) {
|
||||||
{
|
|
||||||
CHECK(phrasePos == 0);
|
CHECK(phrasePos == 0);
|
||||||
delete lmState;
|
delete lmState;
|
||||||
lmState = NewState( GetBeginSentenceState() );
|
lmState = NewState( GetBeginSentenceState() );
|
||||||
}
|
}
|
||||||
// score a regular word added by the rule
|
// score a regular word added by the rule
|
||||||
else
|
else {
|
||||||
{
|
|
||||||
updateChartScore( &prefixScore, &finalizedScore, UntransformLMScore(GetValueGivenState(contextFactor, *lmState).score), ++wordPos );
|
updateChartScore( &prefixScore, &finalizedScore, UntransformLMScore(GetValueGivenState(contextFactor, *lmState).score), ++wordPos );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// non-terminal, add phrase from underlying hypothesis
|
// non-terminal, add phrase from underlying hypothesis
|
||||||
else
|
else {
|
||||||
{
|
|
||||||
// look up underlying hypothesis
|
// look up underlying hypothesis
|
||||||
size_t nonTermIndex = nonTermIndexMap[phrasePos];
|
size_t nonTermIndex = nonTermIndexMap[phrasePos];
|
||||||
const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndex);
|
const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndex);
|
||||||
@ -444,8 +439,7 @@ FFState* LanguageModelImplementation::EvaluateChart(const ChartHypothesis& hypo,
|
|||||||
// push suffix
|
// push suffix
|
||||||
int suffixPos = prevState->GetSuffix().GetSize() - (GetNGramOrder()-1);
|
int suffixPos = prevState->GetSuffix().GetSize() - (GetNGramOrder()-1);
|
||||||
if (suffixPos < 0) suffixPos = 0; // push all words if less than order
|
if (suffixPos < 0) suffixPos = 0; // push all words if less than order
|
||||||
for(;(size_t)suffixPos < prevState->GetSuffix().GetSize(); suffixPos++)
|
for(; (size_t)suffixPos < prevState->GetSuffix().GetSize(); suffixPos++) {
|
||||||
{
|
|
||||||
const Word &word = prevState->GetSuffix().GetWord(suffixPos);
|
const Word &word = prevState->GetSuffix().GetWord(suffixPos);
|
||||||
ShiftOrPush(contextFactor, word);
|
ShiftOrPush(contextFactor, word);
|
||||||
wordPos++;
|
wordPos++;
|
||||||
@ -453,22 +447,19 @@ FFState* LanguageModelImplementation::EvaluateChart(const ChartHypothesis& hypo,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// internal non-terminal
|
// internal non-terminal
|
||||||
else
|
else {
|
||||||
{
|
|
||||||
// score its prefix
|
// score its prefix
|
||||||
for(size_t prefixPos = 0;
|
for(size_t prefixPos = 0;
|
||||||
prefixPos < GetNGramOrder()-1 // up to LM order window
|
prefixPos < GetNGramOrder()-1 // up to LM order window
|
||||||
&& prefixPos < subPhraseLength; // up to length
|
&& prefixPos < subPhraseLength; // up to length
|
||||||
prefixPos++)
|
prefixPos++) {
|
||||||
{
|
|
||||||
const Word &word = prevState->GetPrefix().GetWord(prefixPos);
|
const Word &word = prevState->GetPrefix().GetWord(prefixPos);
|
||||||
ShiftOrPush(contextFactor, word);
|
ShiftOrPush(contextFactor, word);
|
||||||
updateChartScore( &prefixScore, &finalizedScore, UntransformLMScore(GetValueGivenState(contextFactor, *lmState).score), ++wordPos );
|
updateChartScore( &prefixScore, &finalizedScore, UntransformLMScore(GetValueGivenState(contextFactor, *lmState).score), ++wordPos );
|
||||||
}
|
}
|
||||||
|
|
||||||
// check if we are dealing with a large sub-phrase
|
// check if we are dealing with a large sub-phrase
|
||||||
if (subPhraseLength > GetNGramOrder() - 1)
|
if (subPhraseLength > GetNGramOrder() - 1) {
|
||||||
{
|
|
||||||
// add its finalized language model score
|
// add its finalized language model score
|
||||||
finalizedScore +=
|
finalizedScore +=
|
||||||
prevHypo->GetScoreBreakdown().GetScoresForProducer(scorer)[0] // full score
|
prevHypo->GetScoreBreakdown().GetScoresForProducer(scorer)[0] // full score
|
||||||
@ -503,11 +494,11 @@ FFState* LanguageModelImplementation::EvaluateChart(const ChartHypothesis& hypo,
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
void LanguageModelImplementation::updateChartScore(float *prefixScore, float *finalizedScore, float score, size_t wordPos) const {
|
void LanguageModelImplementation::updateChartScore(float *prefixScore, float *finalizedScore, float score, size_t wordPos) const
|
||||||
|
{
|
||||||
if (wordPos < GetNGramOrder()) {
|
if (wordPos < GetNGramOrder()) {
|
||||||
*prefixScore += score;
|
*prefixScore += score;
|
||||||
}
|
} else {
|
||||||
else {
|
|
||||||
*finalizedScore += score;
|
*finalizedScore += score;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -45,7 +45,7 @@ class Phrase;
|
|||||||
struct LMResult {
|
struct LMResult {
|
||||||
// log probability
|
// log probability
|
||||||
float score;
|
float score;
|
||||||
// Is the word unknown?
|
// Is the word unknown?
|
||||||
bool unknown;
|
bool unknown;
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -126,54 +126,55 @@ public:
|
|||||||
virtual void CleanUpAfterSentenceProcessing() {};
|
virtual void CleanUpAfterSentenceProcessing() {};
|
||||||
};
|
};
|
||||||
|
|
||||||
class LMRefCount : public LanguageModel {
|
class LMRefCount : public LanguageModel
|
||||||
public:
|
{
|
||||||
LMRefCount(ScoreIndexManager &scoreIndexManager, LanguageModelImplementation *impl) : m_impl(impl) {
|
public:
|
||||||
Init(scoreIndexManager);
|
LMRefCount(ScoreIndexManager &scoreIndexManager, LanguageModelImplementation *impl) : m_impl(impl) {
|
||||||
}
|
Init(scoreIndexManager);
|
||||||
|
}
|
||||||
|
|
||||||
LanguageModel *Duplicate(ScoreIndexManager &scoreIndexManager) const {
|
LanguageModel *Duplicate(ScoreIndexManager &scoreIndexManager) const {
|
||||||
return new LMRefCount(scoreIndexManager, *this);
|
return new LMRefCount(scoreIndexManager, *this);
|
||||||
}
|
}
|
||||||
|
|
||||||
void InitializeBeforeSentenceProcessing() {
|
void InitializeBeforeSentenceProcessing() {
|
||||||
m_impl->InitializeBeforeSentenceProcessing();
|
m_impl->InitializeBeforeSentenceProcessing();
|
||||||
}
|
}
|
||||||
|
|
||||||
void CleanUpAfterSentenceProcessing() {
|
void CleanUpAfterSentenceProcessing() {
|
||||||
m_impl->CleanUpAfterSentenceProcessing();
|
m_impl->CleanUpAfterSentenceProcessing();
|
||||||
}
|
}
|
||||||
|
|
||||||
const FFState* EmptyHypothesisState(const InputType &/*input*/) const {
|
const FFState* EmptyHypothesisState(const InputType &/*input*/) const {
|
||||||
return m_impl->NewState(m_impl->GetBeginSentenceState());
|
return m_impl->NewState(m_impl->GetBeginSentenceState());
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Useable(const Phrase &phrase) const {
|
bool Useable(const Phrase &phrase) const {
|
||||||
return m_impl->Useable(phrase);
|
return m_impl->Useable(phrase);
|
||||||
}
|
}
|
||||||
|
|
||||||
void CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const {
|
void CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const {
|
||||||
return m_impl->CalcScore(phrase, fullScore, ngramScore, oovCount);
|
return m_impl->CalcScore(phrase, fullScore, ngramScore, oovCount);
|
||||||
}
|
}
|
||||||
|
|
||||||
FFState* Evaluate(const Hypothesis& cur_hypo, const FFState* prev_state, ScoreComponentCollection* accumulator) const {
|
FFState* Evaluate(const Hypothesis& cur_hypo, const FFState* prev_state, ScoreComponentCollection* accumulator) const {
|
||||||
return m_impl->Evaluate(cur_hypo, prev_state, accumulator, this);
|
return m_impl->Evaluate(cur_hypo, prev_state, accumulator, this);
|
||||||
}
|
}
|
||||||
|
|
||||||
FFState* EvaluateChart(const ChartHypothesis& cur_hypo, int featureID, ScoreComponentCollection* accumulator) const {
|
FFState* EvaluateChart(const ChartHypothesis& cur_hypo, int featureID, ScoreComponentCollection* accumulator) const {
|
||||||
return m_impl->EvaluateChart(cur_hypo, featureID, accumulator, this);
|
return m_impl->EvaluateChart(cur_hypo, featureID, accumulator, this);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string GetScoreProducerDescription(unsigned int param) const {
|
std::string GetScoreProducerDescription(unsigned int param) const {
|
||||||
return m_impl->GetScoreProducerDescription(param);
|
return m_impl->GetScoreProducerDescription(param);
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
LMRefCount(ScoreIndexManager &scoreIndexManager, const LMRefCount &copy_from) : m_impl(copy_from.m_impl) {
|
LMRefCount(ScoreIndexManager &scoreIndexManager, const LMRefCount &copy_from) : m_impl(copy_from.m_impl) {
|
||||||
Init(scoreIndexManager);
|
Init(scoreIndexManager);
|
||||||
}
|
}
|
||||||
|
|
||||||
boost::shared_ptr<LanguageModelImplementation> m_impl;
|
boost::shared_ptr<LanguageModelImplementation> m_impl;
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -43,8 +43,10 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
namespace Moses {
|
namespace Moses
|
||||||
namespace {
|
{
|
||||||
|
namespace
|
||||||
|
{
|
||||||
|
|
||||||
struct KenLMState : public FFState {
|
struct KenLMState : public FFState {
|
||||||
lm::ngram::State state;
|
lm::ngram::State state;
|
||||||
@ -59,67 +61,69 @@ struct KenLMState : public FFState {
|
|||||||
/*
|
/*
|
||||||
* An implementation of single factor LM using Ken's code.
|
* An implementation of single factor LM using Ken's code.
|
||||||
*/
|
*/
|
||||||
template <class Model> class LanguageModelKen : public LanguageModel {
|
template <class Model> class LanguageModelKen : public LanguageModel
|
||||||
public:
|
{
|
||||||
LanguageModelKen(const std::string &file, ScoreIndexManager &manager, FactorType factorType, bool lazy);
|
public:
|
||||||
|
LanguageModelKen(const std::string &file, ScoreIndexManager &manager, FactorType factorType, bool lazy);
|
||||||
|
|
||||||
LanguageModel *Duplicate(ScoreIndexManager &scoreIndexManager) const;
|
LanguageModel *Duplicate(ScoreIndexManager &scoreIndexManager) const;
|
||||||
|
|
||||||
bool Useable(const Phrase &phrase) const {
|
bool Useable(const Phrase &phrase) const {
|
||||||
return (phrase.GetSize()>0 && phrase.GetFactor(0, m_factorType) != NULL);
|
return (phrase.GetSize()>0 && phrase.GetFactor(0, m_factorType) != NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string GetScoreProducerDescription(unsigned) const {
|
std::string GetScoreProducerDescription(unsigned) const {
|
||||||
std::ostringstream oss;
|
std::ostringstream oss;
|
||||||
oss << "LM_" << m_ngram->Order() << "gram";
|
oss << "LM_" << m_ngram->Order() << "gram";
|
||||||
return oss.str();
|
return oss.str();
|
||||||
}
|
}
|
||||||
|
|
||||||
const FFState *EmptyHypothesisState(const InputType &/*input*/) const {
|
const FFState *EmptyHypothesisState(const InputType &/*input*/) const {
|
||||||
KenLMState *ret = new KenLMState();
|
KenLMState *ret = new KenLMState();
|
||||||
ret->state = m_ngram->BeginSentenceState();
|
ret->state = m_ngram->BeginSentenceState();
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
void CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const;
|
void CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const;
|
||||||
|
|
||||||
FFState *Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const;
|
FFState *Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const;
|
||||||
|
|
||||||
FFState *EvaluateChart(const ChartHypothesis& cur_hypo, int featureID, ScoreComponentCollection *accumulator) const;
|
FFState *EvaluateChart(const ChartHypothesis& cur_hypo, int featureID, ScoreComponentCollection *accumulator) const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
LanguageModelKen(ScoreIndexManager &manager, const LanguageModelKen<Model> &copy_from);
|
LanguageModelKen(ScoreIndexManager &manager, const LanguageModelKen<Model> &copy_from);
|
||||||
|
|
||||||
lm::WordIndex TranslateID(const Word &word) const {
|
lm::WordIndex TranslateID(const Word &word) const {
|
||||||
std::size_t factor = word.GetFactor(m_factorType)->GetId();
|
std::size_t factor = word.GetFactor(m_factorType)->GetId();
|
||||||
return (factor >= m_lmIdLookup.size() ? 0 : m_lmIdLookup[factor]);
|
return (factor >= m_lmIdLookup.size() ? 0 : m_lmIdLookup[factor]);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Convert last words of hypothesis into vocab ids, returning an end pointer.
|
// Convert last words of hypothesis into vocab ids, returning an end pointer.
|
||||||
lm::WordIndex *LastIDs(const Hypothesis &hypo, lm::WordIndex *indices) const {
|
lm::WordIndex *LastIDs(const Hypothesis &hypo, lm::WordIndex *indices) const {
|
||||||
lm::WordIndex *index = indices;
|
lm::WordIndex *index = indices;
|
||||||
lm::WordIndex *end = indices + m_ngram->Order() - 1;
|
lm::WordIndex *end = indices + m_ngram->Order() - 1;
|
||||||
int position = hypo.GetCurrTargetWordsRange().GetEndPos();
|
int position = hypo.GetCurrTargetWordsRange().GetEndPos();
|
||||||
for (; ; ++index, --position) {
|
for (; ; ++index, --position) {
|
||||||
if (position == -1) {
|
if (position == -1) {
|
||||||
*index = m_ngram->GetVocabulary().BeginSentence();
|
*index = m_ngram->GetVocabulary().BeginSentence();
|
||||||
return index + 1;
|
return index + 1;
|
||||||
}
|
|
||||||
if (index == end) return index;
|
|
||||||
*index = TranslateID(hypo.GetWord(position));
|
|
||||||
}
|
}
|
||||||
|
if (index == end) return index;
|
||||||
|
*index = TranslateID(hypo.GetWord(position));
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
boost::shared_ptr<Model> m_ngram;
|
boost::shared_ptr<Model> m_ngram;
|
||||||
|
|
||||||
std::vector<lm::WordIndex> m_lmIdLookup;
|
|
||||||
|
|
||||||
FactorType m_factorType;
|
std::vector<lm::WordIndex> m_lmIdLookup;
|
||||||
|
|
||||||
const Factor *m_beginSentenceFactor;
|
FactorType m_factorType;
|
||||||
|
|
||||||
|
const Factor *m_beginSentenceFactor;
|
||||||
};
|
};
|
||||||
|
|
||||||
class MappingBuilder : public lm::EnumerateVocab {
|
class MappingBuilder : public lm::EnumerateVocab
|
||||||
|
{
|
||||||
public:
|
public:
|
||||||
MappingBuilder(FactorCollection &factorCollection, std::vector<lm::WordIndex> &mapping)
|
MappingBuilder(FactorCollection &factorCollection, std::vector<lm::WordIndex> &mapping)
|
||||||
: m_factorCollection(factorCollection), m_mapping(mapping) {}
|
: m_factorCollection(factorCollection), m_mapping(mapping) {}
|
||||||
@ -138,11 +142,13 @@ private:
|
|||||||
std::vector<lm::WordIndex> &m_mapping;
|
std::vector<lm::WordIndex> &m_mapping;
|
||||||
};
|
};
|
||||||
|
|
||||||
template <class Model> LanguageModelKen<Model>::LanguageModelKen(const std::string &file, ScoreIndexManager &manager, FactorType factorType, bool lazy) : m_factorType(factorType) {
|
template <class Model> LanguageModelKen<Model>::LanguageModelKen(const std::string &file, ScoreIndexManager &manager, FactorType factorType, bool lazy) : m_factorType(factorType)
|
||||||
|
{
|
||||||
lm::ngram::Config config;
|
lm::ngram::Config config;
|
||||||
IFVERBOSE(1) {
|
IFVERBOSE(1) {
|
||||||
config.messages = &std::cerr;
|
config.messages = &std::cerr;
|
||||||
} else {
|
}
|
||||||
|
else {
|
||||||
config.messages = NULL;
|
config.messages = NULL;
|
||||||
}
|
}
|
||||||
FactorCollection &collection = FactorCollection::Instance();
|
FactorCollection &collection = FactorCollection::Instance();
|
||||||
@ -156,20 +162,23 @@ template <class Model> LanguageModelKen<Model>::LanguageModelKen(const std::stri
|
|||||||
Init(manager);
|
Init(manager);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Model> LanguageModel *LanguageModelKen<Model>::Duplicate(ScoreIndexManager &manager) const {
|
template <class Model> LanguageModel *LanguageModelKen<Model>::Duplicate(ScoreIndexManager &manager) const
|
||||||
|
{
|
||||||
return new LanguageModelKen<Model>(manager, *this);
|
return new LanguageModelKen<Model>(manager, *this);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Model> LanguageModelKen<Model>::LanguageModelKen(ScoreIndexManager &manager, const LanguageModelKen<Model> &copy_from) :
|
template <class Model> LanguageModelKen<Model>::LanguageModelKen(ScoreIndexManager &manager, const LanguageModelKen<Model> &copy_from) :
|
||||||
m_ngram(copy_from.m_ngram),
|
m_ngram(copy_from.m_ngram),
|
||||||
// TODO: don't copy this.
|
// TODO: don't copy this.
|
||||||
m_lmIdLookup(copy_from.m_lmIdLookup),
|
m_lmIdLookup(copy_from.m_lmIdLookup),
|
||||||
m_factorType(copy_from.m_factorType),
|
m_factorType(copy_from.m_factorType),
|
||||||
m_beginSentenceFactor(copy_from.m_beginSentenceFactor) {
|
m_beginSentenceFactor(copy_from.m_beginSentenceFactor)
|
||||||
|
{
|
||||||
Init(manager);
|
Init(manager);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Model> void LanguageModelKen<Model>::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const {
|
template <class Model> void LanguageModelKen<Model>::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const
|
||||||
|
{
|
||||||
fullScore = 0;
|
fullScore = 0;
|
||||||
ngramScore = 0;
|
ngramScore = 0;
|
||||||
oovCount = 0;
|
oovCount = 0;
|
||||||
@ -186,13 +195,13 @@ template <class Model> void LanguageModelKen<Model>::CalcScore(const Phrase &phr
|
|||||||
*state0 = m_ngram->NullContextState();
|
*state0 = m_ngram->NullContextState();
|
||||||
position = 0;
|
position = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t ngramBoundary = m_ngram->Order() - 1;
|
size_t ngramBoundary = m_ngram->Order() - 1;
|
||||||
|
|
||||||
for (; position < phrase.GetSize(); ++position) {
|
for (; position < phrase.GetSize(); ++position) {
|
||||||
const Word &word = phrase.GetWord(position);
|
const Word &word = phrase.GetWord(position);
|
||||||
if (word.IsNonTerminal()) {
|
if (word.IsNonTerminal()) {
|
||||||
// If there's a non-terminal at 1 and we have a 5-gram LM, then positions 2 3 4 and 5 will be incomplete while position 6 is complete.
|
// If there's a non-terminal at 1 and we have a 5-gram LM, then positions 2 3 4 and 5 will be incomplete while position 6 is complete.
|
||||||
ngramBoundary = m_ngram->Order() + position;
|
ngramBoundary = m_ngram->Order() + position;
|
||||||
*state0 = m_ngram->NullContextState();
|
*state0 = m_ngram->NullContextState();
|
||||||
} else {
|
} else {
|
||||||
@ -210,11 +219,12 @@ template <class Model> void LanguageModelKen<Model>::CalcScore(const Phrase &phr
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Model> FFState *LanguageModelKen<Model>::Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const {
|
template <class Model> FFState *LanguageModelKen<Model>::Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const
|
||||||
|
{
|
||||||
const lm::ngram::State &in_state = static_cast<const KenLMState&>(*ps).state;
|
const lm::ngram::State &in_state = static_cast<const KenLMState&>(*ps).state;
|
||||||
|
|
||||||
std::auto_ptr<KenLMState> ret(new KenLMState());
|
std::auto_ptr<KenLMState> ret(new KenLMState());
|
||||||
|
|
||||||
if (!hypo.GetCurrTargetLength()) {
|
if (!hypo.GetCurrTargetLength()) {
|
||||||
ret->state = in_state;
|
ret->state = in_state;
|
||||||
return ret.release();
|
return ret.release();
|
||||||
@ -237,17 +247,17 @@ template <class Model> FFState *LanguageModelKen<Model>::Evaluate(const Hypothes
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (hypo.IsSourceCompleted()) {
|
if (hypo.IsSourceCompleted()) {
|
||||||
// Score end of sentence.
|
// Score end of sentence.
|
||||||
std::vector<lm::WordIndex> indices(m_ngram->Order() - 1);
|
std::vector<lm::WordIndex> indices(m_ngram->Order() - 1);
|
||||||
const lm::WordIndex *last = LastIDs(hypo, &indices.front());
|
const lm::WordIndex *last = LastIDs(hypo, &indices.front());
|
||||||
score += m_ngram->FullScoreForgotState(&indices.front(), last, m_ngram->GetVocabulary().EndSentence(), ret->state).prob;
|
score += m_ngram->FullScoreForgotState(&indices.front(), last, m_ngram->GetVocabulary().EndSentence(), ret->state).prob;
|
||||||
} else if (adjust_end < end) {
|
} else if (adjust_end < end) {
|
||||||
// Get state after adding a long phrase.
|
// Get state after adding a long phrase.
|
||||||
std::vector<lm::WordIndex> indices(m_ngram->Order() - 1);
|
std::vector<lm::WordIndex> indices(m_ngram->Order() - 1);
|
||||||
const lm::WordIndex *last = LastIDs(hypo, &indices.front());
|
const lm::WordIndex *last = LastIDs(hypo, &indices.front());
|
||||||
m_ngram->GetState(&indices.front(), last, ret->state);
|
m_ngram->GetState(&indices.front(), last, ret->state);
|
||||||
} else if (state0 != &ret->state) {
|
} else if (state0 != &ret->state) {
|
||||||
// Short enough phrase that we can just reuse the state.
|
// Short enough phrase that we can just reuse the state.
|
||||||
ret->state = *state0;
|
ret->state = *state0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -265,32 +275,37 @@ template <class Model> FFState *LanguageModelKen<Model>::Evaluate(const Hypothes
|
|||||||
return ret.release();
|
return ret.release();
|
||||||
}
|
}
|
||||||
|
|
||||||
class LanguageModelChartStateKenLM : public FFState {
|
class LanguageModelChartStateKenLM : public FFState
|
||||||
public:
|
{
|
||||||
LanguageModelChartStateKenLM() {}
|
public:
|
||||||
|
LanguageModelChartStateKenLM() {}
|
||||||
|
|
||||||
const lm::ngram::ChartState &GetChartState() const { return m_state; }
|
const lm::ngram::ChartState &GetChartState() const {
|
||||||
lm::ngram::ChartState &GetChartState() { return m_state; }
|
return m_state;
|
||||||
|
}
|
||||||
|
lm::ngram::ChartState &GetChartState() {
|
||||||
|
return m_state;
|
||||||
|
}
|
||||||
|
|
||||||
int Compare(const FFState& o) const
|
int Compare(const FFState& o) const {
|
||||||
{
|
const LanguageModelChartStateKenLM &other = static_cast<const LanguageModelChartStateKenLM&>(o);
|
||||||
const LanguageModelChartStateKenLM &other = static_cast<const LanguageModelChartStateKenLM&>(o);
|
int ret = m_state.Compare(other.m_state);
|
||||||
int ret = m_state.Compare(other.m_state);
|
return ret;
|
||||||
return ret;
|
}
|
||||||
}
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
lm::ngram::ChartState m_state;
|
lm::ngram::ChartState m_state;
|
||||||
};
|
};
|
||||||
|
|
||||||
template <class Model> FFState *LanguageModelKen<Model>::EvaluateChart(const ChartHypothesis& hypo, int featureID, ScoreComponentCollection *accumulator) const {
|
template <class Model> FFState *LanguageModelKen<Model>::EvaluateChart(const ChartHypothesis& hypo, int featureID, ScoreComponentCollection *accumulator) const
|
||||||
|
{
|
||||||
LanguageModelChartStateKenLM *newState = new LanguageModelChartStateKenLM();
|
LanguageModelChartStateKenLM *newState = new LanguageModelChartStateKenLM();
|
||||||
lm::ngram::RuleScore<Model> ruleScore(*m_ngram, newState->GetChartState());
|
lm::ngram::RuleScore<Model> ruleScore(*m_ngram, newState->GetChartState());
|
||||||
const AlignmentInfo::NonTermIndexMap &nonTermIndexMap = hypo.GetCurrTargetPhrase().GetAlignmentInfo().GetNonTermIndexMap();
|
const AlignmentInfo::NonTermIndexMap &nonTermIndexMap = hypo.GetCurrTargetPhrase().GetAlignmentInfo().GetNonTermIndexMap();
|
||||||
|
|
||||||
const size_t size = hypo.GetCurrTargetPhrase().GetSize();
|
const size_t size = hypo.GetCurrTargetPhrase().GetSize();
|
||||||
size_t phrasePos = 0;
|
size_t phrasePos = 0;
|
||||||
// Special cases for first word.
|
// Special cases for first word.
|
||||||
if (size) {
|
if (size) {
|
||||||
const Word &word = hypo.GetCurrTargetPhrase().GetWord(0);
|
const Word &word = hypo.GetCurrTargetPhrase().GetWord(0);
|
||||||
if (word.GetFactor(m_factorType) == m_beginSentenceFactor) {
|
if (word.GetFactor(m_factorType) == m_beginSentenceFactor) {
|
||||||
@ -298,7 +313,7 @@ template <class Model> FFState *LanguageModelKen<Model>::EvaluateChart(const Cha
|
|||||||
ruleScore.BeginSentence();
|
ruleScore.BeginSentence();
|
||||||
phrasePos++;
|
phrasePos++;
|
||||||
} else if (word.IsNonTerminal()) {
|
} else if (word.IsNonTerminal()) {
|
||||||
// Non-terminal is first so we can copy instead of rescoring.
|
// Non-terminal is first so we can copy instead of rescoring.
|
||||||
const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndexMap[phrasePos]);
|
const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndexMap[phrasePos]);
|
||||||
const lm::ngram::ChartState &prevState = static_cast<const LanguageModelChartStateKenLM*>(prevHypo->GetFFState(featureID))->GetChartState();
|
const lm::ngram::ChartState &prevState = static_cast<const LanguageModelChartStateKenLM*>(prevHypo->GetFFState(featureID))->GetChartState();
|
||||||
ruleScore.BeginNonTerminal(prevState, prevHypo->GetScoreBreakdown().GetScoresForProducer(this)[0]);
|
ruleScore.BeginNonTerminal(prevState, prevHypo->GetScoreBreakdown().GetScoresForProducer(this)[0]);
|
||||||
@ -323,24 +338,25 @@ template <class Model> FFState *LanguageModelKen<Model>::EvaluateChart(const Cha
|
|||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
LanguageModel *ConstructKenLM(const std::string &file, ScoreIndexManager &manager, FactorType factorType, bool lazy) {
|
LanguageModel *ConstructKenLM(const std::string &file, ScoreIndexManager &manager, FactorType factorType, bool lazy)
|
||||||
|
{
|
||||||
try {
|
try {
|
||||||
lm::ngram::ModelType model_type;
|
lm::ngram::ModelType model_type;
|
||||||
if (lm::ngram::RecognizeBinary(file.c_str(), model_type)) {
|
if (lm::ngram::RecognizeBinary(file.c_str(), model_type)) {
|
||||||
switch(model_type) {
|
switch(model_type) {
|
||||||
case lm::ngram::HASH_PROBING:
|
case lm::ngram::HASH_PROBING:
|
||||||
return new LanguageModelKen<lm::ngram::ProbingModel>(file, manager, factorType, lazy);
|
return new LanguageModelKen<lm::ngram::ProbingModel>(file, manager, factorType, lazy);
|
||||||
case lm::ngram::TRIE_SORTED:
|
case lm::ngram::TRIE_SORTED:
|
||||||
return new LanguageModelKen<lm::ngram::TrieModel>(file, manager, factorType, lazy);
|
return new LanguageModelKen<lm::ngram::TrieModel>(file, manager, factorType, lazy);
|
||||||
case lm::ngram::QUANT_TRIE_SORTED:
|
case lm::ngram::QUANT_TRIE_SORTED:
|
||||||
return new LanguageModelKen<lm::ngram::QuantTrieModel>(file, manager, factorType, lazy);
|
return new LanguageModelKen<lm::ngram::QuantTrieModel>(file, manager, factorType, lazy);
|
||||||
case lm::ngram::ARRAY_TRIE_SORTED:
|
case lm::ngram::ARRAY_TRIE_SORTED:
|
||||||
return new LanguageModelKen<lm::ngram::ArrayTrieModel>(file, manager, factorType, lazy);
|
return new LanguageModelKen<lm::ngram::ArrayTrieModel>(file, manager, factorType, lazy);
|
||||||
case lm::ngram::QUANT_ARRAY_TRIE_SORTED:
|
case lm::ngram::QUANT_ARRAY_TRIE_SORTED:
|
||||||
return new LanguageModelKen<lm::ngram::QuantArrayTrieModel>(file, manager, factorType, lazy);
|
return new LanguageModelKen<lm::ngram::QuantArrayTrieModel>(file, manager, factorType, lazy);
|
||||||
default:
|
default:
|
||||||
std::cerr << "Unrecognized kenlm model type " << model_type << std::endl;
|
std::cerr << "Unrecognized kenlm model type " << model_type << std::endl;
|
||||||
abort();
|
abort();
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
return new LanguageModelKen<lm::ngram::ProbingModel>(file, manager, factorType, lazy);
|
return new LanguageModelKen<lm::ngram::ProbingModel>(file, manager, factorType, lazy);
|
||||||
|
@ -26,12 +26,13 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|||||||
|
|
||||||
#include "TypeDef.h"
|
#include "TypeDef.h"
|
||||||
|
|
||||||
namespace Moses {
|
namespace Moses
|
||||||
|
{
|
||||||
|
|
||||||
class ScoreIndexManager;
|
class ScoreIndexManager;
|
||||||
class LanguageModel;
|
class LanguageModel;
|
||||||
|
|
||||||
// This will also load.
|
// This will also load.
|
||||||
LanguageModel *ConstructKenLM(const std::string &file, ScoreIndexManager &manager, FactorType factorType, bool lazy);
|
LanguageModel *ConstructKenLM(const std::string &file, ScoreIndexManager &manager, FactorType factorType, bool lazy);
|
||||||
|
|
||||||
} // namespace Moses
|
} // namespace Moses
|
||||||
|
@ -9,10 +9,11 @@
|
|||||||
#include "LM/ORLM.h"
|
#include "LM/ORLM.h"
|
||||||
|
|
||||||
using std::map;
|
using std::map;
|
||||||
namespace Moses
|
namespace Moses
|
||||||
|
{
|
||||||
|
bool LanguageModelORLM::Load(const std::string &filePath, FactorType factorType,
|
||||||
|
size_t nGramOrder)
|
||||||
{
|
{
|
||||||
bool LanguageModelORLM::Load(const std::string &filePath, FactorType factorType,
|
|
||||||
size_t nGramOrder) {
|
|
||||||
cerr << "Loading LanguageModelORLM..." << endl;
|
cerr << "Loading LanguageModelORLM..." << endl;
|
||||||
m_filePath = filePath;
|
m_filePath = filePath;
|
||||||
m_factorType = factorType;
|
m_factorType = factorType;
|
||||||
@ -26,13 +27,14 @@ bool LanguageModelORLM::Load(const std::string &filePath, FactorType factorType,
|
|||||||
CreateFactors();
|
CreateFactors();
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
void LanguageModelORLM::CreateFactors() {
|
void LanguageModelORLM::CreateFactors()
|
||||||
|
{
|
||||||
FactorCollection &factorCollection = FactorCollection::Instance();
|
FactorCollection &factorCollection = FactorCollection::Instance();
|
||||||
size_t maxFactorId = 0; // to create lookup vector later on
|
size_t maxFactorId = 0; // to create lookup vector later on
|
||||||
std::map<size_t, wordID_t> m_lmids_map; // map from factor id -> word id
|
std::map<size_t, wordID_t> m_lmids_map; // map from factor id -> word id
|
||||||
|
|
||||||
for(std::map<Word, wordID_t>::const_iterator vIter = m_lm->vocab_->VocabStart();
|
for(std::map<Word, wordID_t>::const_iterator vIter = m_lm->vocab_->VocabStart();
|
||||||
vIter != m_lm->vocab_->VocabEnd(); vIter++){
|
vIter != m_lm->vocab_->VocabEnd(); vIter++) {
|
||||||
// get word from ORLM vocab and associate with (new) factor id
|
// get word from ORLM vocab and associate with (new) factor id
|
||||||
size_t factorId = factorCollection.AddFactor(Output,m_factorType,vIter->first.ToString())->GetId();
|
size_t factorId = factorCollection.AddFactor(Output,m_factorType,vIter->first.ToString())->GetId();
|
||||||
m_lmids_map[factorId] = vIter->second;
|
m_lmids_map[factorId] = vIter->second;
|
||||||
@ -50,7 +52,7 @@ void LanguageModelORLM::CreateFactors() {
|
|||||||
maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
|
maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
|
||||||
m_sentenceEndArray[m_factorType] = m_sentenceEnd;
|
m_sentenceEndArray[m_factorType] = m_sentenceEnd;
|
||||||
// add to lookup vector in object
|
// add to lookup vector in object
|
||||||
lm_ids_vec_.resize(maxFactorId+1);
|
lm_ids_vec_.resize(maxFactorId+1);
|
||||||
// fill with OOV code
|
// fill with OOV code
|
||||||
fill(lm_ids_vec_.begin(), lm_ids_vec_.end(), m_oov_id);
|
fill(lm_ids_vec_.begin(), lm_ids_vec_.end(), m_oov_id);
|
||||||
|
|
||||||
@ -58,15 +60,18 @@ void LanguageModelORLM::CreateFactors() {
|
|||||||
iter != m_lmids_map.end() ; ++iter)
|
iter != m_lmids_map.end() ; ++iter)
|
||||||
lm_ids_vec_[iter->first] = iter->second;
|
lm_ids_vec_[iter->first] = iter->second;
|
||||||
}
|
}
|
||||||
wordID_t LanguageModelORLM::GetLmID(const std::string& str) const {
|
wordID_t LanguageModelORLM::GetLmID(const std::string& str) const
|
||||||
|
{
|
||||||
return m_lm->vocab_->GetWordID(str);
|
return m_lm->vocab_->GetWordID(str);
|
||||||
}
|
}
|
||||||
wordID_t LanguageModelORLM::GetLmID(const Factor* factor) const {
|
wordID_t LanguageModelORLM::GetLmID(const Factor* factor) const
|
||||||
|
{
|
||||||
size_t factorId = factor->GetId();
|
size_t factorId = factor->GetId();
|
||||||
return (factorId >= lm_ids_vec_.size()) ? m_oov_id : lm_ids_vec_[factorId];
|
return (factorId >= lm_ids_vec_.size()) ? m_oov_id : lm_ids_vec_[factorId];
|
||||||
}
|
}
|
||||||
LMResult LanguageModelORLM::GetValue(const std::vector<const Word*> &contextFactor,
|
LMResult LanguageModelORLM::GetValue(const std::vector<const Word*> &contextFactor,
|
||||||
State* finalState) const {
|
State* finalState) const
|
||||||
|
{
|
||||||
FactorType factorType = GetFactorType();
|
FactorType factorType = GetFactorType();
|
||||||
// set up context
|
// set up context
|
||||||
//std::vector<long unsigned int> factor(1,0);
|
//std::vector<long unsigned int> factor(1,0);
|
||||||
@ -88,13 +93,14 @@ LMResult LanguageModelORLM::GetValue(const std::vector<const Word*> &contextFact
|
|||||||
*/
|
*/
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
bool LanguageModelORLM::UpdateORLM(const std::vector<string>& ngram, const int value) {
|
bool LanguageModelORLM::UpdateORLM(const std::vector<string>& ngram, const int value)
|
||||||
|
{
|
||||||
/*cerr << "Inserting into ORLM: \"";
|
/*cerr << "Inserting into ORLM: \"";
|
||||||
iterate(ngram, nit)
|
iterate(ngram, nit)
|
||||||
cerr << *nit << " ";
|
cerr << *nit << " ";
|
||||||
cerr << "\"\t" << value << endl; */
|
cerr << "\"\t" << value << endl; */
|
||||||
m_lm->vocab_->MakeOpen();
|
m_lm->vocab_->MakeOpen();
|
||||||
bool res = m_lm->update(ngram, value);
|
bool res = m_lm->update(ngram, value);
|
||||||
m_lm->vocab_->MakeClosed();
|
m_lm->vocab_->MakeClosed();
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
@ -15,7 +15,8 @@ namespace Moses
|
|||||||
class Factor;
|
class Factor;
|
||||||
class Phrase;
|
class Phrase;
|
||||||
|
|
||||||
class LanguageModelORLM : public LanguageModelPointerState {
|
class LanguageModelORLM : public LanguageModelPointerState
|
||||||
|
{
|
||||||
public:
|
public:
|
||||||
typedef count_t T; // type for ORLM filter
|
typedef count_t T; // type for ORLM filter
|
||||||
LanguageModelORLM()
|
LanguageModelORLM()
|
||||||
@ -30,13 +31,15 @@ public:
|
|||||||
fout.close();
|
fout.close();
|
||||||
delete m_lm;
|
delete m_lm;
|
||||||
}
|
}
|
||||||
void CleanUpAfterSentenceProcessing() {m_lm->clearCache();} // clear caches
|
void CleanUpAfterSentenceProcessing() {
|
||||||
|
m_lm->clearCache(); // clear caches
|
||||||
|
}
|
||||||
void InitializeBeforeSentenceProcessing() { // nothing to do
|
void InitializeBeforeSentenceProcessing() { // nothing to do
|
||||||
//m_lm->initThreadSpecificData(); // Creates thread specific data iff
|
//m_lm->initThreadSpecificData(); // Creates thread specific data iff
|
||||||
// compiled with multithreading.
|
// compiled with multithreading.
|
||||||
}
|
}
|
||||||
bool UpdateORLM(const std::vector<string>& ngram, const int value);
|
bool UpdateORLM(const std::vector<string>& ngram, const int value);
|
||||||
protected:
|
protected:
|
||||||
OnlineRLM<T>* m_lm;
|
OnlineRLM<T>* m_lm;
|
||||||
//MultiOnlineRLM<T>* m_lm;
|
//MultiOnlineRLM<T>* m_lm;
|
||||||
wordID_t m_oov_id;
|
wordID_t m_oov_id;
|
||||||
|
@ -347,7 +347,8 @@ const FFState *LanguageModelParallelBackoff::GetBeginSentenceState() const
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
LanguageModelMultiFactor *NewParallelBackoff() {
|
LanguageModelMultiFactor *NewParallelBackoff()
|
||||||
|
{
|
||||||
return new LanguageModelParallelBackoff();
|
return new LanguageModelParallelBackoff();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -38,7 +38,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|||||||
|
|
||||||
namespace Moses
|
namespace Moses
|
||||||
{
|
{
|
||||||
namespace
|
namespace
|
||||||
{
|
{
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
@ -57,7 +57,7 @@ public:
|
|||||||
}
|
}
|
||||||
void InitializeBeforeSentenceProcessing() {
|
void InitializeBeforeSentenceProcessing() {
|
||||||
m_lm->initThreadSpecificData(); // Creates thread specific data iff
|
m_lm->initThreadSpecificData(); // Creates thread specific data iff
|
||||||
// compiled with multithreading.
|
// compiled with multithreading.
|
||||||
}
|
}
|
||||||
protected:
|
protected:
|
||||||
std::vector<randlm::WordID> m_randlm_ids_vec;
|
std::vector<randlm::WordID> m_randlm_ids_vec;
|
||||||
@ -133,7 +133,7 @@ randlm::WordID LanguageModelRandLM::GetLmID( const std::string &str ) const
|
|||||||
}
|
}
|
||||||
|
|
||||||
LMResult LanguageModelRandLM::GetValue(const vector<const Word*> &contextFactor,
|
LMResult LanguageModelRandLM::GetValue(const vector<const Word*> &contextFactor,
|
||||||
State* finalState) const
|
State* finalState) const
|
||||||
{
|
{
|
||||||
FactorType factorType = GetFactorType();
|
FactorType factorType = GetFactorType();
|
||||||
// set up context
|
// set up context
|
||||||
@ -156,7 +156,8 @@ LMResult LanguageModelRandLM::GetValue(const vector<const Word*> &contextFactor,
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
LanguageModelPointerState *NewRandLM() {
|
LanguageModelPointerState *NewRandLM()
|
||||||
|
{
|
||||||
return new LanguageModelRandLM();
|
return new LanguageModelRandLM();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -46,7 +46,7 @@ void LMList::CalcScore(const Phrase &phrase, float &retFullScore, float &retNGra
|
|||||||
const float weightLM = lm.GetWeight();
|
const float weightLM = lm.GetWeight();
|
||||||
const float oovWeightLM = lm.GetOOVWeight();
|
const float oovWeightLM = lm.GetOOVWeight();
|
||||||
|
|
||||||
float fullScore, nGramScore;
|
float fullScore, nGramScore;
|
||||||
size_t oovCount;
|
size_t oovCount;
|
||||||
|
|
||||||
// do not process, if factors not defined yet (happens in partial translation options)
|
// do not process, if factors not defined yet (happens in partial translation options)
|
||||||
@ -64,7 +64,7 @@ void LMList::CalcScore(const Phrase &phrase, float &retFullScore, float &retNGra
|
|||||||
} else {
|
} else {
|
||||||
breakdown->Assign(&lm, nGramScore); // I'm not sure why += doesn't work here- it should be 0.0 right?
|
breakdown->Assign(&lm, nGramScore); // I'm not sure why += doesn't work here- it should be 0.0 right?
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
retFullScore += fullScore * weightLM;
|
retFullScore += fullScore * weightLM;
|
||||||
retNGramScore += nGramScore * weightLM;
|
retNGramScore += nGramScore * weightLM;
|
||||||
|
@ -39,13 +39,13 @@ public:
|
|||||||
virtual FFState* Evaluate(const Hypothesis& cur_hypo,
|
virtual FFState* Evaluate(const Hypothesis& cur_hypo,
|
||||||
const FFState* prev_state,
|
const FFState* prev_state,
|
||||||
ScoreComponentCollection* accumulator) const;
|
ScoreComponentCollection* accumulator) const;
|
||||||
|
|
||||||
virtual FFState* EvaluateChart(const ChartHypothesis&,
|
virtual FFState* EvaluateChart(const ChartHypothesis&,
|
||||||
int /* featureID */,
|
int /* featureID */,
|
||||||
ScoreComponentCollection*) const {
|
ScoreComponentCollection*) const {
|
||||||
CHECK(0); // not valid for chart decoder
|
CHECK(0); // not valid for chart decoder
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual const FFState* EmptyHypothesisState(const InputType &input) const;
|
virtual const FFState* EmptyHypothesisState(const InputType &input) const;
|
||||||
|
|
||||||
|
@ -267,8 +267,9 @@ struct SGNReverseCompare {
|
|||||||
/**
|
/**
|
||||||
* Implements lattice sampling, as in Chatterjee & Cancedda, emnlp 2010
|
* Implements lattice sampling, as in Chatterjee & Cancedda, emnlp 2010
|
||||||
**/
|
**/
|
||||||
void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
|
void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const
|
||||||
|
{
|
||||||
|
|
||||||
vector<SearchGraphNode> searchGraph;
|
vector<SearchGraphNode> searchGraph;
|
||||||
GetSearchGraph(searchGraph);
|
GetSearchGraph(searchGraph);
|
||||||
|
|
||||||
@ -282,15 +283,15 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
|
|||||||
map<int,const Hypothesis*> idToHyp;
|
map<int,const Hypothesis*> idToHyp;
|
||||||
map<int,float> fscores;
|
map<int,float> fscores;
|
||||||
|
|
||||||
//Iterating through the hypos in reverse order of id gives a reverse
|
//Iterating through the hypos in reverse order of id gives a reverse
|
||||||
//topological order. We rely on the fact that hypo ids are given out
|
//topological order. We rely on the fact that hypo ids are given out
|
||||||
//sequentially, as the search proceeds.
|
//sequentially, as the search proceeds.
|
||||||
//NB: Could just sort by stack.
|
//NB: Could just sort by stack.
|
||||||
sort(searchGraph.begin(), searchGraph.end(), SGNReverseCompare());
|
sort(searchGraph.begin(), searchGraph.end(), SGNReverseCompare());
|
||||||
|
|
||||||
//first task is to fill in the outgoing hypos and edge scores.
|
//first task is to fill in the outgoing hypos and edge scores.
|
||||||
for (vector<SearchGraphNode>::const_iterator i = searchGraph.begin();
|
for (vector<SearchGraphNode>::const_iterator i = searchGraph.begin();
|
||||||
i != searchGraph.end(); ++i) {
|
i != searchGraph.end(); ++i) {
|
||||||
const Hypothesis* hypo = i->hypo;
|
const Hypothesis* hypo = i->hypo;
|
||||||
idToHyp[hypo->GetId()] = hypo;
|
idToHyp[hypo->GetId()] = hypo;
|
||||||
fscores[hypo->GetId()] = i->fscore;
|
fscores[hypo->GetId()] = i->fscore;
|
||||||
@ -298,7 +299,7 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
|
|||||||
//back to current
|
//back to current
|
||||||
const Hypothesis* prevHypo = i->hypo->GetPrevHypo();
|
const Hypothesis* prevHypo = i->hypo->GetPrevHypo();
|
||||||
outgoingHyps[prevHypo].insert(hypo);
|
outgoingHyps[prevHypo].insert(hypo);
|
||||||
edgeScores[Edge(prevHypo->GetId(),hypo->GetId())] =
|
edgeScores[Edge(prevHypo->GetId(),hypo->GetId())] =
|
||||||
hypo->GetScore() - prevHypo->GetScore();
|
hypo->GetScore() - prevHypo->GetScore();
|
||||||
}
|
}
|
||||||
//forward from current
|
//forward from current
|
||||||
@ -309,7 +310,7 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
|
|||||||
outgoingHyps[hypo].insert(nextHypo);
|
outgoingHyps[hypo].insert(nextHypo);
|
||||||
map<int,float>::const_iterator fscoreIter = fscores.find(nextHypo->GetId());
|
map<int,float>::const_iterator fscoreIter = fscores.find(nextHypo->GetId());
|
||||||
CHECK(fscoreIter != fscores.end());
|
CHECK(fscoreIter != fscores.end());
|
||||||
edgeScores[Edge(hypo->GetId(),nextHypo->GetId())] =
|
edgeScores[Edge(hypo->GetId(),nextHypo->GetId())] =
|
||||||
i->fscore - fscoreIter->second;
|
i->fscore - fscoreIter->second;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -317,26 +318,26 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
|
|||||||
|
|
||||||
//then run through again to calculate sigmas
|
//then run through again to calculate sigmas
|
||||||
for (vector<SearchGraphNode>::const_iterator i = searchGraph.begin();
|
for (vector<SearchGraphNode>::const_iterator i = searchGraph.begin();
|
||||||
i != searchGraph.end(); ++i) {
|
i != searchGraph.end(); ++i) {
|
||||||
|
|
||||||
if (i->forward == -1) {
|
if (i->forward == -1) {
|
||||||
sigmas[i->hypo] = 0;
|
sigmas[i->hypo] = 0;
|
||||||
} else {
|
} else {
|
||||||
map<const Hypothesis*, set<const Hypothesis*> >::const_iterator outIter =
|
map<const Hypothesis*, set<const Hypothesis*> >::const_iterator outIter =
|
||||||
outgoingHyps.find(i->hypo);
|
outgoingHyps.find(i->hypo);
|
||||||
|
|
||||||
CHECK(outIter != outgoingHyps.end());
|
CHECK(outIter != outgoingHyps.end());
|
||||||
float sigma = 0;
|
float sigma = 0;
|
||||||
for (set<const Hypothesis*>::const_iterator j = outIter->second.begin();
|
for (set<const Hypothesis*>::const_iterator j = outIter->second.begin();
|
||||||
j != outIter->second.end(); ++j) {
|
j != outIter->second.end(); ++j) {
|
||||||
map<const Hypothesis*, float>::const_iterator succIter = sigmas.find(*j);
|
map<const Hypothesis*, float>::const_iterator succIter = sigmas.find(*j);
|
||||||
CHECK(succIter != sigmas.end());
|
CHECK(succIter != sigmas.end());
|
||||||
map<Edge,float>::const_iterator edgeScoreIter =
|
map<Edge,float>::const_iterator edgeScoreIter =
|
||||||
edgeScores.find(Edge(i->hypo->GetId(),(*j)->GetId()));
|
edgeScores.find(Edge(i->hypo->GetId(),(*j)->GetId()));
|
||||||
CHECK(edgeScoreIter != edgeScores.end());
|
CHECK(edgeScoreIter != edgeScores.end());
|
||||||
float term = edgeScoreIter->second + succIter->second; // Add sigma(*j)
|
float term = edgeScoreIter->second + succIter->second; // Add sigma(*j)
|
||||||
if (sigma == 0) {
|
if (sigma == 0) {
|
||||||
sigma = term;
|
sigma = term;
|
||||||
} else {
|
} else {
|
||||||
sigma = log_sum(sigma,term);
|
sigma = log_sum(sigma,term);
|
||||||
}
|
}
|
||||||
@ -352,7 +353,7 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
|
|||||||
vector<const Hypothesis*> path;
|
vector<const Hypothesis*> path;
|
||||||
path.push_back(startHypo);
|
path.push_back(startHypo);
|
||||||
while(1) {
|
while(1) {
|
||||||
map<const Hypothesis*, set<const Hypothesis*> >::const_iterator outIter =
|
map<const Hypothesis*, set<const Hypothesis*> >::const_iterator outIter =
|
||||||
outgoingHyps.find(path.back());
|
outgoingHyps.find(path.back());
|
||||||
if (outIter == outgoingHyps.end() || !outIter->second.size()) {
|
if (outIter == outgoingHyps.end() || !outIter->second.size()) {
|
||||||
//end of the path
|
//end of the path
|
||||||
@ -363,7 +364,7 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
|
|||||||
vector<float> candidateScores;
|
vector<float> candidateScores;
|
||||||
float scoreTotal = 0;
|
float scoreTotal = 0;
|
||||||
for (set<const Hypothesis*>::const_iterator j = outIter->second.begin();
|
for (set<const Hypothesis*>::const_iterator j = outIter->second.begin();
|
||||||
j != outIter->second.end(); ++j) {
|
j != outIter->second.end(); ++j) {
|
||||||
candidates.push_back(*j);
|
candidates.push_back(*j);
|
||||||
CHECK(sigmas.find(*j) != sigmas.end());
|
CHECK(sigmas.find(*j) != sigmas.end());
|
||||||
Edge edge(path.back()->GetId(),(*j)->GetId());
|
Edge edge(path.back()->GetId(),(*j)->GetId());
|
||||||
@ -390,18 +391,18 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
|
|||||||
}
|
}
|
||||||
//cerr << "Random: " << random << " Chose " << position-1 << endl;
|
//cerr << "Random: " << random << " Chose " << position-1 << endl;
|
||||||
const Hypothesis* chosen = candidates[position-1];
|
const Hypothesis* chosen = candidates[position-1];
|
||||||
path.push_back(chosen);
|
path.push_back(chosen);
|
||||||
}
|
}
|
||||||
//cerr << "Path: " << endl;
|
//cerr << "Path: " << endl;
|
||||||
//for (size_t j = 0; j < path.size(); ++j) {
|
//for (size_t j = 0; j < path.size(); ++j) {
|
||||||
// cerr << path[j]->GetId() << " " << path[j]->GetScoreBreakdown() << endl;
|
// cerr << path[j]->GetId() << " " << path[j]->GetScoreBreakdown() << endl;
|
||||||
//}
|
//}
|
||||||
//cerr << endl;
|
//cerr << endl;
|
||||||
|
|
||||||
//Convert the hypos to TrellisPath
|
//Convert the hypos to TrellisPath
|
||||||
ret.Add(new TrellisPath(path));
|
ret.Add(new TrellisPath(path));
|
||||||
//cerr << ret.at(ret.GetSize()-1).GetScoreBreakdown() << endl;
|
//cerr << ret.at(ret.GetSize()-1).GetScoreBreakdown() << endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -676,17 +677,17 @@ void OutputSearchNode(long translationId, std::ostream &outputSearchGraphStream,
|
|||||||
else
|
else
|
||||||
outputSearchGraphStream << " hyp=" << searchNode.hypo->GetId();
|
outputSearchGraphStream << " hyp=" << searchNode.hypo->GetId();
|
||||||
|
|
||||||
outputSearchGraphStream << " stack=" << searchNode.hypo->GetWordsBitmap().GetNumWordsCovered()
|
outputSearchGraphStream << " stack=" << searchNode.hypo->GetWordsBitmap().GetNumWordsCovered()
|
||||||
<< " back=" << prevHypo->GetId()
|
<< " back=" << prevHypo->GetId()
|
||||||
<< " score=" << searchNode.hypo->GetScore()
|
<< " score=" << searchNode.hypo->GetScore()
|
||||||
<< " transition=" << (searchNode.hypo->GetScore() - prevHypo->GetScore());
|
<< " transition=" << (searchNode.hypo->GetScore() - prevHypo->GetScore());
|
||||||
|
|
||||||
if (searchNode.recombinationHypo != NULL)
|
if (searchNode.recombinationHypo != NULL)
|
||||||
outputSearchGraphStream << " recombined=" << searchNode.recombinationHypo->GetId();
|
outputSearchGraphStream << " recombined=" << searchNode.recombinationHypo->GetId();
|
||||||
|
|
||||||
outputSearchGraphStream << " forward=" << searchNode.forward << " fscore=" << searchNode.fscore
|
outputSearchGraphStream << " forward=" << searchNode.forward << " fscore=" << searchNode.fscore
|
||||||
<< " covered=" << searchNode.hypo->GetCurrSourceWordsRange().GetStartPos()
|
<< " covered=" << searchNode.hypo->GetCurrSourceWordsRange().GetStartPos()
|
||||||
<< "-" << searchNode.hypo->GetCurrSourceWordsRange().GetEndPos();
|
<< "-" << searchNode.hypo->GetCurrSourceWordsRange().GetEndPos();
|
||||||
|
|
||||||
// Modified so that -osgx is a superset of -osg (GST Oct 2011)
|
// Modified so that -osgx is a superset of -osg (GST Oct 2011)
|
||||||
ScoreComponentCollection scoreBreakdown = searchNode.hypo->GetScoreBreakdown();
|
ScoreComponentCollection scoreBreakdown = searchNode.hypo->GetScoreBreakdown();
|
||||||
@ -694,10 +695,10 @@ void OutputSearchNode(long translationId, std::ostream &outputSearchGraphStream,
|
|||||||
outputSearchGraphStream << " scores=[ ";
|
outputSearchGraphStream << " scores=[ ";
|
||||||
StaticData::Instance().GetScoreIndexManager().PrintLabeledScores( outputSearchGraphStream, scoreBreakdown );
|
StaticData::Instance().GetScoreIndexManager().PrintLabeledScores( outputSearchGraphStream, scoreBreakdown );
|
||||||
outputSearchGraphStream << " ]";
|
outputSearchGraphStream << " ]";
|
||||||
|
|
||||||
|
|
||||||
outputSearchGraphStream << " out=" << searchNode.hypo->GetSourcePhraseStringRep() << "|" <<
|
outputSearchGraphStream << " out=" << searchNode.hypo->GetSourcePhraseStringRep() << "|" <<
|
||||||
searchNode.hypo->GetCurrTargetPhrase().GetStringRep(outputFactorOrder) << endl;
|
searchNode.hypo->GetCurrTargetPhrase().GetStringRep(outputFactorOrder) << endl;
|
||||||
// outputSearchGraphStream << " out=" << searchNode.hypo->GetCurrTargetPhrase().GetStringRep(outputFactorOrder) << endl;
|
// outputSearchGraphStream << " out=" << searchNode.hypo->GetCurrTargetPhrase().GetStringRep(outputFactorOrder) << endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -36,7 +36,7 @@ namespace PCN
|
|||||||
typedef std::pair<std::pair<std::string, std::vector<float> >, size_t> CNAlt;
|
typedef std::pair<std::pair<std::string, std::vector<float> >, size_t> CNAlt;
|
||||||
typedef std::vector<CNAlt> CNCol;
|
typedef std::vector<CNAlt> CNCol;
|
||||||
typedef std::vector<CNCol> CN;
|
typedef std::vector<CNCol> CN;
|
||||||
|
|
||||||
/** Given a string ((('foo',0.1,1),('bar',0.9,2)),...) representation of a
|
/** Given a string ((('foo',0.1,1),('bar',0.9,2)),...) representation of a
|
||||||
* word lattice in PCN format, return a CN object representing the lattice
|
* word lattice in PCN format, return a CN object representing the lattice
|
||||||
*/
|
*/
|
||||||
|
@ -71,10 +71,10 @@ Parameter::Parameter()
|
|||||||
AddParam("report-all-factors-in-n-best", "Report all factors in n-best-lists. Default is false");
|
AddParam("report-all-factors-in-n-best", "Report all factors in n-best-lists. Default is false");
|
||||||
AddParam("report-segmentation", "t", "report phrase segmentation in the output");
|
AddParam("report-segmentation", "t", "report phrase segmentation in the output");
|
||||||
#ifdef HAVE_SYNLM
|
#ifdef HAVE_SYNLM
|
||||||
AddParam("slmodel-file", "location of the syntactic language model file(s)");
|
AddParam("slmodel-file", "location of the syntactic language model file(s)");
|
||||||
AddParam("weight-slm", "slm", "weight(s) for syntactic language model");
|
AddParam("weight-slm", "slm", "weight(s) for syntactic language model");
|
||||||
AddParam("slmodel-factor", "factor to use with syntactic language model");
|
AddParam("slmodel-factor", "factor to use with syntactic language model");
|
||||||
AddParam("slmodel-beam", "beam width to use with syntactic language model's parser");
|
AddParam("slmodel-beam", "beam width to use with syntactic language model's parser");
|
||||||
#endif
|
#endif
|
||||||
AddParam("stack", "s", "maximum stack size for histogram pruning");
|
AddParam("stack", "s", "maximum stack size for histogram pruning");
|
||||||
AddParam("stack-diversity", "sd", "minimum number of hypothesis of each coverage in stack (default 0)");
|
AddParam("stack-diversity", "sd", "minimum number of hypothesis of each coverage in stack (default 0)");
|
||||||
@ -277,14 +277,13 @@ bool Parameter::Validate()
|
|||||||
PARAM_MAP::const_iterator iterParams;
|
PARAM_MAP::const_iterator iterParams;
|
||||||
for (iterParams = m_setting.begin(); iterParams != m_setting.end(); ++iterParams) {
|
for (iterParams = m_setting.begin(); iterParams != m_setting.end(); ++iterParams) {
|
||||||
const std::string &key = iterParams->first;
|
const std::string &key = iterParams->first;
|
||||||
|
|
||||||
if (m_valid.find(key) == m_valid.end())
|
if (m_valid.find(key) == m_valid.end()) {
|
||||||
{
|
|
||||||
UserMessage::Add("Unknown parameter " + key);
|
UserMessage::Add("Unknown parameter " + key);
|
||||||
noErrorFlag = false;
|
noErrorFlag = false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// required parameters
|
// required parameters
|
||||||
if (m_setting["ttable-file"].size() == 0) {
|
if (m_setting["ttable-file"].size() == 0) {
|
||||||
@ -307,7 +306,7 @@ bool Parameter::Validate()
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (m_setting["lmodel-file"].size() * (m_setting.find("lmodel-oov-feature") != m_setting.end() ? 2 : 1)
|
if (m_setting["lmodel-file"].size() * (m_setting.find("lmodel-oov-feature") != m_setting.end() ? 2 : 1)
|
||||||
!= m_setting["weight-l"].size()) {
|
!= m_setting["weight-l"].size()) {
|
||||||
stringstream errorMsg("");
|
stringstream errorMsg("");
|
||||||
errorMsg << "Config and parameters specify "
|
errorMsg << "Config and parameters specify "
|
||||||
<< static_cast<int>(m_setting["lmodel-file"].size())
|
<< static_cast<int>(m_setting["lmodel-file"].size())
|
||||||
@ -457,8 +456,7 @@ bool Parameter::ReadConfigFile(const string &filePath )
|
|||||||
|
|
||||||
if (line.size() == 0) {
|
if (line.size() == 0) {
|
||||||
// blank line. do nothing.
|
// blank line. do nothing.
|
||||||
}
|
} else if (line[0]=='[') {
|
||||||
else if (line[0]=='[') {
|
|
||||||
// new parameter
|
// new parameter
|
||||||
for (size_t currPos = 0 ; currPos < line.size() ; currPos++) {
|
for (size_t currPos = 0 ; currPos < line.size() ; currPos++) {
|
||||||
if (line[currPos] == ']') {
|
if (line[currPos] == ']') {
|
||||||
|
@ -143,9 +143,9 @@ void Phrase::CreateFromString(const std::vector<FactorType> &factorOrder, const
|
|||||||
for (util::TokenIter<util::AnyCharacter, true> word_it(phraseString, util::AnyCharacter(" \t")); word_it; ++word_it) {
|
for (util::TokenIter<util::AnyCharacter, true> word_it(phraseString, util::AnyCharacter(" \t")); word_it; ++word_it) {
|
||||||
Word &word = AddWord();
|
Word &word = AddWord();
|
||||||
size_t index = 0;
|
size_t index = 0;
|
||||||
for (util::TokenIter<util::MultiCharacter, false> factor_it(*word_it, util::MultiCharacter(factorDelimiter));
|
for (util::TokenIter<util::MultiCharacter, false> factor_it(*word_it, util::MultiCharacter(factorDelimiter));
|
||||||
factor_it && (index < factorOrder.size());
|
factor_it && (index < factorOrder.size());
|
||||||
++factor_it, ++index) {
|
++factor_it, ++index) {
|
||||||
word[factorOrder[index]] = factorCollection.AddFactor(*factor_it);
|
word[factorOrder[index]] = factorCollection.AddFactor(*factor_it);
|
||||||
}
|
}
|
||||||
if (index != factorOrder.size()) {
|
if (index != factorOrder.size()) {
|
||||||
|
@ -61,7 +61,7 @@ public:
|
|||||||
/** Fills phrase with words from format string, typically from phrase table or sentence input
|
/** Fills phrase with words from format string, typically from phrase table or sentence input
|
||||||
* \param factorOrder factor types of each element in 2D string vector
|
* \param factorOrder factor types of each element in 2D string vector
|
||||||
* \param phraseString formatted input string to parse
|
* \param phraseString formatted input string to parse
|
||||||
* \param factorDelimiter delimiter between factors.
|
* \param factorDelimiter delimiter between factors.
|
||||||
*/
|
*/
|
||||||
void CreateFromString(const std::vector<FactorType> &factorOrder, const StringPiece &phraseString, const StringPiece &factorDelimiter);
|
void CreateFromString(const std::vector<FactorType> &factorOrder, const StringPiece &phraseString, const StringPiece &factorDelimiter);
|
||||||
|
|
||||||
|
@ -136,7 +136,7 @@ PhraseDictionary* PhraseDictionaryFeature::LoadPhraseTable(const TranslationSyst
|
|||||||
m_filePath += ".gz";
|
m_filePath += ".gz";
|
||||||
VERBOSE(2,"Using gzipped file" << std::endl);
|
VERBOSE(2,"Using gzipped file" << std::endl);
|
||||||
}
|
}
|
||||||
|
|
||||||
PhraseDictionaryHiero* pdm = new PhraseDictionaryHiero(m_numScoreComponent,this);
|
PhraseDictionaryHiero* pdm = new PhraseDictionaryHiero(m_numScoreComponent,this);
|
||||||
bool ret = pdm->Load(GetInput()
|
bool ret = pdm->Load(GetInput()
|
||||||
, GetOutput()
|
, GetOutput()
|
||||||
@ -154,7 +154,7 @@ PhraseDictionary* PhraseDictionaryFeature::LoadPhraseTable(const TranslationSyst
|
|||||||
m_filePath += ".gz";
|
m_filePath += ".gz";
|
||||||
VERBOSE(2,"Using gzipped file" << std::endl);
|
VERBOSE(2,"Using gzipped file" << std::endl);
|
||||||
}
|
}
|
||||||
|
|
||||||
PhraseDictionaryALSuffixArray* pdm = new PhraseDictionaryALSuffixArray(m_numScoreComponent,this);
|
PhraseDictionaryALSuffixArray* pdm = new PhraseDictionaryALSuffixArray(m_numScoreComponent,this);
|
||||||
bool ret = pdm->Load(GetInput()
|
bool ret = pdm->Load(GetInput()
|
||||||
, GetOutput()
|
, GetOutput()
|
||||||
@ -255,18 +255,18 @@ PhraseDictionaryFeature::~PhraseDictionaryFeature()
|
|||||||
|
|
||||||
std::string PhraseDictionaryFeature::GetScoreProducerDescription(unsigned idx) const
|
std::string PhraseDictionaryFeature::GetScoreProducerDescription(unsigned idx) const
|
||||||
{
|
{
|
||||||
if (idx < GetNumInputScores()){
|
if (idx < GetNumInputScores()) {
|
||||||
return "InputScore";
|
return "InputScore";
|
||||||
}else{
|
} else {
|
||||||
return "PhraseModel";
|
return "PhraseModel";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string PhraseDictionaryFeature::GetScoreProducerWeightShortName(unsigned idx) const
|
std::string PhraseDictionaryFeature::GetScoreProducerWeightShortName(unsigned idx) const
|
||||||
{
|
{
|
||||||
if (idx < GetNumInputScores()){
|
if (idx < GetNumInputScores()) {
|
||||||
return "I";
|
return "I";
|
||||||
}else{
|
} else {
|
||||||
return "tm";
|
return "tm";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -16,16 +16,16 @@
|
|||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
namespace Moses
|
namespace Moses
|
||||||
{
|
{
|
||||||
|
|
||||||
bool PhraseDictionaryALSuffixArray::Load(const std::vector<FactorType> &input
|
bool PhraseDictionaryALSuffixArray::Load(const std::vector<FactorType> &input
|
||||||
, const std::vector<FactorType> &output
|
, const std::vector<FactorType> &output
|
||||||
, const std::string &filePath
|
, const std::string &filePath
|
||||||
, const std::vector<float> &weight
|
, const std::vector<float> &weight
|
||||||
, size_t tableLimit
|
, size_t tableLimit
|
||||||
, const LMList &languageModels
|
, const LMList &languageModels
|
||||||
, const WordPenaltyProducer* wpProducer)
|
, const WordPenaltyProducer* wpProducer)
|
||||||
{
|
{
|
||||||
// file path is the directory of the rules for eacg, NOT the file of all the rules
|
// file path is the directory of the rules for eacg, NOT the file of all the rules
|
||||||
m_filePath = filePath;
|
m_filePath = filePath;
|
||||||
@ -36,7 +36,7 @@ bool PhraseDictionaryALSuffixArray::Load(const std::vector<FactorType> &input
|
|||||||
m_languageModels = &languageModels;
|
m_languageModels = &languageModels;
|
||||||
m_wpProducer = wpProducer;
|
m_wpProducer = wpProducer;
|
||||||
m_weight = &weight;
|
m_weight = &weight;
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -44,20 +44,20 @@ void PhraseDictionaryALSuffixArray::InitializeForInput(InputType const& source)
|
|||||||
{
|
{
|
||||||
// clear out rules for previous sentence
|
// clear out rules for previous sentence
|
||||||
m_collection.Clear();
|
m_collection.Clear();
|
||||||
|
|
||||||
// populate with rules for this sentence
|
// populate with rules for this sentence
|
||||||
long translationId = source.GetTranslationId();
|
long translationId = source.GetTranslationId();
|
||||||
|
|
||||||
string grammarFile = m_filePath + "/grammar.out." + SPrint(translationId);
|
string grammarFile = m_filePath + "/grammar.out." + SPrint(translationId);
|
||||||
|
|
||||||
// data from file
|
// data from file
|
||||||
InputFileStream inFile(grammarFile);
|
InputFileStream inFile(grammarFile);
|
||||||
|
|
||||||
std::auto_ptr<RuleTableLoader> loader =
|
std::auto_ptr<RuleTableLoader> loader =
|
||||||
RuleTableLoaderFactory::Create(grammarFile);
|
RuleTableLoaderFactory::Create(grammarFile);
|
||||||
bool ret = loader->Load(*m_input, *m_output, inFile, *m_weight, m_tableLimit,
|
bool ret = loader->Load(*m_input, *m_output, inFile, *m_weight, m_tableLimit,
|
||||||
*m_languageModels, m_wpProducer, *this);
|
*m_languageModels, m_wpProducer, *this);
|
||||||
|
|
||||||
CHECK(ret);
|
CHECK(ret);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -11,13 +11,14 @@
|
|||||||
|
|
||||||
#include "PhraseDictionarySCFG.h"
|
#include "PhraseDictionarySCFG.h"
|
||||||
|
|
||||||
namespace Moses {
|
namespace Moses
|
||||||
|
{
|
||||||
|
|
||||||
class PhraseDictionaryALSuffixArray : public PhraseDictionarySCFG
|
class PhraseDictionaryALSuffixArray : public PhraseDictionarySCFG
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
PhraseDictionaryALSuffixArray(size_t numScoreComponent, PhraseDictionaryFeature* feature)
|
PhraseDictionaryALSuffixArray(size_t numScoreComponent, PhraseDictionaryFeature* feature)
|
||||||
: PhraseDictionarySCFG(numScoreComponent,feature) {}
|
: PhraseDictionarySCFG(numScoreComponent,feature) {}
|
||||||
|
|
||||||
bool Load(const std::vector<FactorType> &input
|
bool Load(const std::vector<FactorType> &input
|
||||||
, const std::vector<FactorType> &output
|
, const std::vector<FactorType> &output
|
||||||
@ -34,9 +35,9 @@ protected:
|
|||||||
const LMList *m_languageModels;
|
const LMList *m_languageModels;
|
||||||
const WordPenaltyProducer *m_wpProducer;
|
const WordPenaltyProducer *m_wpProducer;
|
||||||
const std::vector<float> *m_weight;
|
const std::vector<float> *m_weight;
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -72,7 +72,7 @@ const TargetPhraseCollection *PhraseDictionaryDynSuffixArray::GetTargetPhraseCol
|
|||||||
void PhraseDictionaryDynSuffixArray::insertSnt(string& source, string& target, string& alignment)
|
void PhraseDictionaryDynSuffixArray::insertSnt(string& source, string& target, string& alignment)
|
||||||
{
|
{
|
||||||
m_biSA->addSntPair(source, target, alignment); // insert sentence pair into suffix arrays
|
m_biSA->addSntPair(source, target, alignment); // insert sentence pair into suffix arrays
|
||||||
//StaticData::Instance().ClearTransOptionCache(); // clear translation option cache
|
//StaticData::Instance().ClearTransOptionCache(); // clear translation option cache
|
||||||
}
|
}
|
||||||
void PhraseDictionaryDynSuffixArray::deleteSnt(unsigned /* idx */, unsigned /* num2Del */)
|
void PhraseDictionaryDynSuffixArray::deleteSnt(unsigned /* idx */, unsigned /* num2Del */)
|
||||||
{
|
{
|
||||||
|
@ -15,30 +15,31 @@
|
|||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
namespace Moses {
|
namespace Moses
|
||||||
|
{
|
||||||
|
|
||||||
bool PhraseDictionaryHiero::Load(const std::vector<FactorType> &input
|
bool PhraseDictionaryHiero::Load(const std::vector<FactorType> &input
|
||||||
, const std::vector<FactorType> &output
|
, const std::vector<FactorType> &output
|
||||||
, const std::string &filePath
|
, const std::string &filePath
|
||||||
, const std::vector<float> &weight
|
, const std::vector<float> &weight
|
||||||
, size_t tableLimit
|
, size_t tableLimit
|
||||||
, const LMList &languageModels
|
, const LMList &languageModels
|
||||||
, const WordPenaltyProducer* wpProducer)
|
, const WordPenaltyProducer* wpProducer)
|
||||||
{
|
{
|
||||||
m_filePath = filePath;
|
m_filePath = filePath;
|
||||||
m_tableLimit = tableLimit;
|
m_tableLimit = tableLimit;
|
||||||
|
|
||||||
|
|
||||||
// data from file
|
// data from file
|
||||||
InputFileStream inFile(filePath);
|
InputFileStream inFile(filePath);
|
||||||
|
|
||||||
std::auto_ptr<RuleTableLoader> loader =
|
std::auto_ptr<RuleTableLoader> loader =
|
||||||
RuleTableLoaderFactory::Create(filePath);
|
RuleTableLoaderFactory::Create(filePath);
|
||||||
bool ret = loader->Load(input, output, inFile, weight, tableLimit,
|
bool ret = loader->Load(input, output, inFile, weight, tableLimit,
|
||||||
languageModels, wpProducer, *this);
|
languageModels, wpProducer, *this);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
|
|
||||||
|
@ -11,13 +11,14 @@
|
|||||||
|
|
||||||
#include "PhraseDictionarySCFG.h"
|
#include "PhraseDictionarySCFG.h"
|
||||||
|
|
||||||
namespace Moses {
|
namespace Moses
|
||||||
|
{
|
||||||
|
|
||||||
class PhraseDictionaryHiero : public PhraseDictionarySCFG
|
class PhraseDictionaryHiero : public PhraseDictionarySCFG
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
PhraseDictionaryHiero(size_t numScoreComponent, PhraseDictionaryFeature* feature)
|
PhraseDictionaryHiero(size_t numScoreComponent, PhraseDictionaryFeature* feature)
|
||||||
: PhraseDictionarySCFG(numScoreComponent,feature) {}
|
: PhraseDictionarySCFG(numScoreComponent,feature) {}
|
||||||
|
|
||||||
bool Load(const std::vector<FactorType> &input
|
bool Load(const std::vector<FactorType> &input
|
||||||
, const std::vector<FactorType> &output
|
, const std::vector<FactorType> &output
|
||||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user