diff --git a/cbits/BeginPrivate.h b/cbits/BeginPrivate.h
new file mode 100644
index 0000000..e69de29
diff --git a/cbits/EndPrivate.h b/cbits/EndPrivate.h
new file mode 100644
index 0000000..e69de29
diff --git a/cbits/GHCFunctions.h b/cbits/GHCFunctions.h
index d85bed2..cb8aaca 100644
--- a/cbits/GHCFunctions.h
+++ b/cbits/GHCFunctions.h
@@ -31,3 +31,10 @@ extern void stgFree(void* p);
 #ifdef DEBUG
 extern void checkClosure(StgClosure*);
 #endif
+
+#if __GLASGOW_HASKELL__ >= 801
+// We have to bring the internal HEAP_ALLOCED macro into scope to be
+// able to deal with CONSTR_NOCAF (to detect the static ones).
+// This header file is taken directly from the GHC 8.2 source code.
+# include "GhcHeapAlloc.h"
+#endif
diff --git a/cbits/GhcHeapAlloc.h b/cbits/GhcHeapAlloc.h
new file mode 100644
index 0000000..0ec1e6d
--- /dev/null
+++ b/cbits/GhcHeapAlloc.h
@@ -0,0 +1,227 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The University of Glasgow 2006-2008
+ *
+ * The HEAP_ALLOCED() test.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef SM_HEAP_ALLOC_H
+#define SM_HEAP_ALLOC_H
+
+#include "BeginPrivate.h"
+
+/* -----------------------------------------------------------------------------
+   The HEAP_ALLOCED() test.
+
+   HEAP_ALLOCED is called FOR EVERY SINGLE CLOSURE during GC.
+   It needs to be FAST.
+
+   See wiki commentary at
+     http://ghc.haskell.org/trac/ghc/wiki/Commentary/HeapAlloced
+
+   Implementation of HEAP_ALLOCED
+   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+   Since heap is allocated in chunks of megablocks (MBLOCK_SIZE), we
+   can just use a table to record which megablocks in the address
+   space belong to the heap. On a 32-bit machine, with 1Mb
+   megablocks, using 8 bits for each entry in the table, the table
+   requires 4k. Lookups during GC will be fast, because the table
+   will be quickly cached (indeed, performance measurements showed no
+   measurable difference between doing the table lookup and using a
+   constant comparison).
+
+   On 64-bit machines, we have two possibilities. One is to request
+   a single chunk of address space that we deem "large enough"
+   (currently 1TB or the ulimit size, whichever is smaller, although this could
+   easily be extended to, say 16TB or more). Memory from that chunk is GC
+   memory, everything else is not. This case is tricky in that it requires
+   support from the OS to allocate address space without allocating memory (in
+   practice, all modern OSes do this). It's also tricky in that it is the only
+   case where a successful HEAP_ALLOCED(p) check can trigger a segfault when
+   accessing p (and for debugging purposes, it will).
+
+   Alternatively, the older implementation caches one 12-bit block map
+   that describes 4096 megablocks or 4GB of memory. If HEAP_ALLOCED is
+   called for an address that is not in the cache, it calls
+   slowIsHeapAlloced (see MBlock.c) which will find the block map for
+   the 4GB block in question.
+   -------------------------------------------------------------------------- */
+
+#ifdef USE_LARGE_ADDRESS_SPACE
+
+struct mblock_address_range {
+    W_ begin, end;
+    W_ padding[6];  // ensure nothing else inhabits this cache line
+} ATTRIBUTE_ALIGNED(64);
+extern struct mblock_address_range mblock_address_space;
+
+# define HEAP_ALLOCED(p)        ((W_)(p) >= mblock_address_space.begin && \
+                                 (W_)(p) < (mblock_address_space.end))
+# define HEAP_ALLOCED_GC(p)     HEAP_ALLOCED(p)
+
+#elif SIZEOF_VOID_P == 4
+extern StgWord8 mblock_map[];
+
+/* On a 32-bit machine a 4KB table is always sufficient */
+# define MBLOCK_MAP_SIZE        4096
+# define MBLOCK_MAP_ENTRY(p)    ((StgWord)(p) >> MBLOCK_SHIFT)
+# define HEAP_ALLOCED(p)        mblock_map[MBLOCK_MAP_ENTRY(p)]
+# define HEAP_ALLOCED_GC(p)     HEAP_ALLOCED(p)
+
+/* -----------------------------------------------------------------------------
+   HEAP_ALLOCED for 64-bit machines (without LARGE_ADDRESS_SPACE).
+
+   Here are some cache layout options:
+
+   [1]
+   16KB cache of 16-bit entries, 1MB lines (capacity 8GB)
+     mblock size =          20 bits
+     entries     = 8192     13 bits
+     line size   =           0 bits  (1 bit of value)
+     tag size    =          15 bits
+                           = 48 bits
+
+   [2]
+   32KB cache of 16-bit entries, 4MB lines (capacity 32GB)
+     mblock size =          20 bits
+     entries     = 16384    14 bits
+     line size   =           2 bits  (4 bits of value)
+     tag size    =          12 bits
+                           = 48 bits
+
+   [3]
+   16KB cache of 16-bit entries, 2MB lines (capacity 16GB)
+     mblock size =          20 bits
+     entries     = 8192     13 bits
+     line size   =           1 bits  (2 bits of value)
+     tag size    =          14 bits
+                           = 48 bits
+
+   [4]
+   4KB cache of 32-bit entries, 16MB lines (capacity 16GB)
+     mblock size =          20 bits
+     entries     = 1024     10 bits
+     line size   =           4 bits  (16 bits of value)
+     tag size    =          14 bits
+                           = 48 bits
+
+   [5]
+   4KB cache of 64-bit entries, 32MB lines (capacity 16GB)
+     mblock size =          20 bits
+     entries     = 512       9 bits
+     line size   =           5 bits  (32 bits of value)
+     tag size    =          14 bits
+                           = 48 bits
+
+   We actually use none of the above. After much experimentation it was
+   found that optimising the lookup is the most important factor,
+   followed by reducing the number of misses. To that end, we use a
+   variant of [1] in which each cache entry is ((mblock << 1) + value)
+   where value is 0 for non-heap and 1 for heap. The cache entries can
+   be 32 bits, since the mblock number is 48-20 = 28 bits, and we need
+   1 bit for the value. The cache can be as big as we like, but
+   currently we use 8k entries, giving us 8GB capacity.
+
+   ---------------------------------------------------------------------------- */
+
+#elif SIZEOF_VOID_P == 8
+
+#define MBC_LINE_BITS 0
+#define MBC_TAG_BITS 15
+
+#if x86_64_HOST_ARCH
+// 32bits are enough for 'entry' as modern amd64 boxes have
+// only 48bit sized virtual addres.
+typedef StgWord32 MbcCacheLine;
+#else
+// 32bits is not enough here as some arches (like ia64) use
+// upper address bits to distinct memory areas.
+typedef StgWord64 MbcCacheLine;
+#endif
+
+typedef StgWord8 MBlockMapLine;
+
+#define MBLOCK_MAP_LINE(p) (((StgWord)p & 0xffffffff) >> (MBLOCK_SHIFT + MBC_LINE_BITS))
+
+#define MBC_LINE_SIZE  (1<<MBC_LINE_BITS)
+#define MBC_SHIFT      (48 - MBLOCK_SHIFT - MBC_LINE_BITS - MBC_TAG_BITS)
+
+#define MBC_ENTRIES    (1<<MBC_SHIFT)
+
+extern MbcCacheLine mblock_cache[];
+
+#define MBC_LINE(p)    ((StgWord)p >> (MBLOCK_SHIFT + MBC_LINE_BITS))
+
+#define MBLOCK_MAP_ENTRIES (1 << (32 - MBLOCK_SHIFT - MBC_LINE_BITS))
+
+typedef struct {
+    StgWord32 addrHigh32;
+    MBlockMapLine lines[MBLOCK_MAP_ENTRIES];
+} MBlockMap;
+
+extern W_ mpc_misses;
+
+StgBool HEAP_ALLOCED_miss(StgWord mblock, const void *p);
+
+INLINE_HEADER
+StgBool HEAP_ALLOCED(const void *p)
+{
+    StgWord mblock;
+    uint32_t entry_no;
+    MbcCacheLine entry, value;
+
+    mblock   = (StgWord)p >> MBLOCK_SHIFT;
+    entry_no = mblock & (MBC_ENTRIES-1);
+    entry    = mblock_cache[entry_no];
+    value    = entry ^ (mblock << 1);
+    // this formulation coaxes gcc into prioritising the value==1
+    // case, which we expect to be the most common.
+    // __builtin_expect() didn't have any useful effect (gcc-4.3.0).
+    if (value == 1) {
+        return 1;
+    } else if (value == 0) {
+        return 0;
+    } else {
+        // putting the rest out of line turned out to be a slight
+        // performance improvement:
+        return HEAP_ALLOCED_miss(mblock,p);
+    }
+}
+
+// In the parallel GC, the cache itself is safe to *read*, and can be
+// updated atomically, but we need to place a lock around operations
+// that touch the MBlock map.
+INLINE_HEADER
+StgBool HEAP_ALLOCED_GC(void *p)
+{
+    StgWord mblock;
+    uint32_t entry_no;
+    MbcCacheLine entry, value;
+    StgBool b;
+
+    mblock   = (StgWord)p >> MBLOCK_SHIFT;
+    entry_no = mblock & (MBC_ENTRIES-1);
+    entry    = mblock_cache[entry_no];
+    value    = entry ^ (mblock << 1);
+    if (value == 1) {
+        return 1;
+    } else if (value == 0) {
+        return 0;
+    } else {
+        // putting the rest out of line turned out to be a slight
+        // performance improvement:
+        ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
+        b = HEAP_ALLOCED_miss(mblock,p);
+        RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
+        return b;
+    }
+}
+
+#else
+# error HEAP_ALLOCED not defined
+#endif
+
+#include "EndPrivate.h"
+
+#endif /* SM_HEAP_ALLOC_H */
diff --git a/cbits/Types.h b/cbits/Types.h
index 48169ee..160c86d 100644
--- a/cbits/Types.h
+++ b/cbits/Types.h
@@ -10,7 +10,7 @@ typedef struct pmPackBuffer_ { // for data messages only,
     StgInt /* nat */ size;
     StgInt /* nat */ unpacked_size;
-    StgWord buffer[FLEXIBLE_ARRAY];
+    StgWord buffer[];
 } pmPackBuffer;
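Note (not part of the patch itself): the new comment in GHCFunctions.h says HEAP_ALLOCED is brought into scope to deal with CONSTR_NOCAF and detect the static ones, since from GHC 8.2 on static and dynamic constructors without CAF references share that single closure type. A minimal sketch of such a check follows, assuming the usual GHC RTS headers are available; the helper name isStaticConstrNocaf is hypothetical and not part of the patch.

#include "Rts.h"            // StgClosure, get_itbl, UNTAG_CLOSURE, CONSTR_NOCAF
#include "GhcHeapAlloc.h"   // HEAP_ALLOCED, as vendored by this patch

// Hypothetical helper, for illustration only.
static StgBool isStaticConstrNocaf(StgClosure *p)
{
    StgClosure *c = UNTAG_CLOSURE(p);          // strip pointer tags before inspecting
    const StgInfoTable *info = get_itbl(c);    // closure's info table (GHC RTS macro)
    if (info->type != CONSTR_NOCAF) {
        return 0;
    }
    // A CONSTR_NOCAF closure that does not live in the GC-managed heap
    // must be a static closure compiled into the binary.
    return !HEAP_ALLOCED(c);
}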