[remotefilelog] initial checkin of a c datapack parser

Summary: This is not yet complete, but seems to be able to parse a data file.

Test Plan:
`/Users/tonytung/Library/Caches/CLion2016.2/cmake/generated/cdatapack-64b7828e/64b7828e/Debug/cdatapack_dump d864669a5651d04505ec6e5e9dba1319cde71f7b > /tmp/2`

compare it with the output of `hg debugdatapack --long d864669a5651d04505ec6e5e9dba1319cde71f7b > /tmp/1`

and it exactly matches.

Reviewers: durham

Reviewed By: durham

Subscribers: mitrandir

Differential Revision: https://phabricator.intern.facebook.com/D3627122

Signature: t1:3627122:1470085301:c9b9e8b2fa57bb7a09dd56d3c811ff8eadbb85ba
This commit is contained in:
Tony Tung 2016-08-01 14:05:37 -07:00
parent 9e557758b0
commit 705c0731b6
8 changed files with 741 additions and 0 deletions

View File

@ -0,0 +1,27 @@
# Copyright 2016-present Facebook. All Rights Reserved.
#
# Build file.
#
# no-check-code
cmake_minimum_required(VERSION 3.5)
project(cdatapack)

# Compiler flags shared by every configuration, followed by the
# per-configuration optimisation/debug settings.
set(CMAKE_C_FLAGS "-std=c99 -Wall -Wshorten-64-to-32 -Wincompatible-pointer-types-discards-qualifiers -Werror")
set(CMAKE_C_FLAGS_DEBUG "-O0 -g")
set(CMAKE_C_FLAGS_RELEASE "-O3 -DNDEBUG")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O0 -g")

# The datapack reader library.
add_library(cdatapack
    buffer.h
    cdatapack.c
    cdatapack.h
    convert.h)

# Command-line tools built on top of the library.
add_executable(cdatapack_dump cdatapack_dump.c)
target_link_libraries(cdatapack_dump cdatapack)

add_executable(cdatapack_get cdatapack_get.c)
target_link_libraries(cdatapack_get cdatapack)

# Stand-alone executable that does not link against the library.
add_executable(null_test null_test.c)

View File

@ -0,0 +1,47 @@
// Copyright 2016-present Facebook. All Rights Reserved.
//
// buffer.h: declarations for a generic mechanism to expand a heap-allocated
// buffer. this is for internal use only.
//
// no-check-code
#ifndef __FASTMANIFEST_BUFFER_H__
#define __FASTMANIFEST_BUFFER_H__
#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>
/**
 * Ensure a heap-allocated array has room for `input_count` more items,
 * growing it with realloc() when it does not.
 *
 * When growth is needed the new capacity is `factor * capacity`, clamped to
 * [capacity + min_increment, capacity + max_increment], and then raised to
 * at least `capacity + input_count`.
 *
 * @param buffer          in/out: address of the array pointer; updated only
 *                        when the array is reallocated successfully.
 * @param num_slots_used  number of slots currently occupied.
 * @param num_slots_total in/out: current capacity in slots; updated only on
 *                        successful reallocation.
 * @param input_count     number of additional slots required.
 * @param item_sz         size of one slot in bytes.
 * @param factor          multiplicative growth factor.
 * @param min_increment   minimum number of slots to add when growing.
 * @param max_increment   maximum number of slots to add when growing (unless
 *                        `input_count` requires more).
 * @return true if the array can now hold the extra items; false if the
 *         required size is not representable or realloc() failed. On false,
 *         `*buffer` and `*num_slots_total` are left unchanged.
 */
static inline bool expand_to_fit(
    void **buffer, size_t num_slots_used, size_t *num_slots_total,
    size_t input_count, size_t item_sz,
    const float factor,
    const size_t min_increment,
    const size_t max_increment) {
  // largest value representable in size_t (avoids depending on <stdint.h>).
  const size_t size_max = (size_t) -1;
  const size_t total = *num_slots_total;

  // a "used" count beyond the capacity must not wrap into a huge remainder.
  const size_t remaining =
      (num_slots_used < total) ? total - num_slots_used : 0;
  if (input_count <= remaining) {
    return true;
  }

  // the smallest acceptable capacity; bail out if it cannot be represented.
  if (input_count > size_max - total) {
    return false;
  }
  const size_t needed = total + input_count;

  // clamp bounds, saturating rather than wrapping.
  const size_t floor_total =
      (min_increment > size_max - total) ? size_max : total + min_increment;
  const size_t ceil_total =
      (max_increment > size_max - total) ? size_max : total + max_increment;

  // converting an out-of-range (or NaN) float to size_t is undefined, so
  // range-check before converting.
  const float scaled = factor * ((float) total);
  size_t new_slots_total;
  if (!(scaled > 0.0f)) {
    new_slots_total = 0;
  } else if (scaled >= (float) size_max) {
    new_slots_total = size_max;
  } else {
    new_slots_total = (size_t) scaled;
  }

  if (new_slots_total < floor_total) {
    new_slots_total = floor_total;
  }
  if (new_slots_total > ceil_total) {
    new_slots_total = ceil_total;
  }
  if (new_slots_total < needed) {
    new_slots_total = needed;
  }

  // refuse byte counts that would wrap around in the multiplication below.
  if (item_sz != 0 && new_slots_total > size_max / item_sz) {
    return false;
  }

  void *newbuffer = realloc(*buffer, item_sz * new_slots_total);
  if (newbuffer == NULL) {
    return false;
  }

  *buffer = newbuffer;
  *num_slots_total = new_slots_total;
  return true;
}
#endif /* __FASTMANIFEST_BUFFER_H__ */

View File

@ -0,0 +1,412 @@
// Copyright 2016-present Facebook. All Rights Reserved.
//
// cdatapack.c: Datapack implementation in C.
//
// no-check-code
#include <fcntl.h>
#include <memory.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>
#include <errno.h>
#include "cdatapack.h"
#include "buffer.h"
/**
 * Exact representation of one index entry as stored in the index file.
 *
 * Do not consume the multi-byte fields directly: unpack_disk_deltachunk()
 * converts them with the ntoh_* helpers into a pack_index_entry_t.
 *
 * NOTE: this uses gcc's __attribute__((packed)) syntax to indicate a packed
 * data structure, which has potential portability issues. Because the
 * structure is packed, its multi-byte fields are not naturally aligned.
 */
typedef struct _disk_index_entry_t {
// the node (hash) identifying this entry; NODE_SZ raw bytes.
uint8_t node[NODE_SZ];
// offset of the next element in the delta chain in the index file. The
// chain walk stops when this equals FULLTEXTINDEXMARK or NOBASEINDEXMARK.
index_offset_t deltabase_index_offset;
// offset and size of this current element in the delta chain in the data
// file.
data_offset_t data_offset;
data_offset_t data_sz;
} __attribute__((packed)) disk_index_entry_t;
/**
 * The range of index entries belonging to one fanout bucket. The table of
 * these is computed by open_datapack() when the file is opened; it is not
 * an on-disk structure.
 *
 * find() treats both bounds as INCLUSIVE when bisecting.
 */
typedef struct _fanout_table_entry_t {
// first index-table slot searched for this bucket.
index_offset_t start_index;
// last index-table slot searched for this bucket (inclusive).
index_offset_t end_index;
} fanout_table_entry_t;
/**
 * A post-processed index entry: the counterpart of disk_index_entry_t with
 * its integer fields already converted by unpack_disk_deltachunk().
 *
 * The node pointer refers to the disk entry it was unpacked from, so it is
 * valid only while the handle that produced this entry remains open.
 */
typedef struct _pack_index_entry_t {
// borrowed pointer to the NODE_SZ-byte node inside the disk entry.
const uint8_t *node;
// offset and size of this current element in the delta chain in the data
// file.
data_offset_t data_offset;
data_offset_t data_sz;
// offset of the next element in the delta chain in the index file
index_offset_t deltabase_index_offset;
} pack_index_entry_t;
/**
 * A growable chain of index entries, assembled by build_pack_chain().
 */
typedef struct _pack_chain_t {
// heap-allocated array of entries.
pack_index_entry_t *pack_chain_links;
// slot index handed to expand_to_fit() as the "slots used" count.
size_t links_idx;
// capacity of pack_chain_links, in entries.
size_t links_sz;
} pack_chain_t;
/**
 * Exact representation of the index file's header as stored on disk. It
 * sits at the very start of the index file and is followed by the fanout
 * table.
 *
 * NOTE: this uses gcc's __attribute__((packed)) syntax to indicate a packed
 * data structure, which has potential portability issues.
 */
typedef struct _disk_index_header_t {
// the only index format version open_datapack() accepts.
#define VERSION 0
uint8_t version;
// flag bit within `config`; open_datapack() consults it to choose between a
// 2^16-entry and a 2^8-entry fanout table.
#define LARGE_FANOUT 0x80
uint8_t config;
} __attribute__((packed)) disk_index_header_t;
/**
 * Convert an on-disk index entry into its in-memory form.
 *
 * The integer fields are run through the ntoh_* conversions; the node is not
 * copied, so `packindex->node` points into `disk_deltachunk`.
 */
static void unpack_disk_deltachunk(
    const disk_index_entry_t *disk_deltachunk,
    pack_index_entry_t *packindex) {
  const data_offset_t raw_offset = disk_deltachunk->data_offset;
  const data_offset_t raw_size = disk_deltachunk->data_sz;
  const index_offset_t raw_base = disk_deltachunk->deltabase_index_offset;

  packindex->node = disk_deltachunk->node;
  packindex->data_offset = ntoh_data_offset(raw_offset);
  packindex->data_sz = ntoh_data_offset(raw_size);
  packindex->deltabase_index_offset = ntoh_index_offset(raw_base);
}
/**
 * Look up `node` in the index: pick a fanout bucket from the leading bytes
 * of the node, then bisect the index entries within that bucket.
 *
 * @param handle    open datapack handle; its fanout_table and index_table
 *                  are read.
 * @param node      the NODE_SZ-byte node to look for.
 * @param packindex receives the unpacked index entry on success; it is not
 *                  written otherwise.
 * @return true if an entry whose node matches exactly was found.
 */
static bool find(
    const datapack_handle_t *handle,
    uint8_t node[NODE_SZ],
    pack_index_entry_t *packindex) {
  // the fanout key is the first one or two bytes of the node, taken as a
  // big-endian integer. assemble it byte by byte: `node` carries no alignment
  // guarantee, so it must not be read through a uint16_t pointer.
  uint16_t fanout_idx;
  if (handle->large_fanout) {
    fanout_idx = (uint16_t) ((node[0] << 8) | node[1]);
  } else {
    fanout_idx = node[0];
  }

  index_offset_t start = handle->fanout_table[fanout_idx].start_index,
      end = handle->fanout_table[fanout_idx].end_index;

  // indices are INCLUSIVE, so the search is <=
  while (start <= end) {
    index_offset_t middle = start + ((end - start) / 2);

    // peek at the hash at that location.
    int cmp = memcmp(node, handle->index_table[middle].node, NODE_SZ);

    if (cmp == 0) {
      // exact match!
      unpack_disk_deltachunk(&handle->index_table[middle], packindex);
      return true;
    }

    if (cmp < 0) {
      // index_offset_t is unsigned: stepping below `start` (possibly 0)
      // would wrap around and send the search out of bounds.
      if (middle == start) {
        break;
      }
      end = middle - 1;
    } else {
      // likewise, never step past `end`.
      if (middle == end) {
        break;
      }
      start = middle + 1;
    }
  }

  // nope, no good.
  return false;
}
datapack_handle_t *open_datapack(
char *indexfp, size_t indexfp_sz,
char *datafp, size_t datafp_sz) {
datapack_handle_t *handle = NULL;
char *buffer = NULL;
handle = malloc(sizeof(datapack_handle_t));
if (handle == NULL) {
// TODO: at some future point in time, it might be nice to add some
// better error reporting like we have in cfastmanifest.
goto error_cleanup;
}
// can't just use memset because MAP_FAILED is the error result code, not
// NULL.
memset(handle, 0, sizeof(datapack_handle_t));
handle->data_mmap = MAP_FAILED;
handle->index_mmap = MAP_FAILED;
buffer = malloc(1 + (indexfp_sz > datafp_sz ? indexfp_sz : datafp_sz));
if (buffer == NULL) {
goto error_cleanup;
}
memcpy(buffer, indexfp, indexfp_sz);
buffer[indexfp_sz] = '\0';
handle->indexfd = open(buffer, O_RDONLY);
if (handle->indexfd < 0) {
goto error_cleanup;
}
handle->index_file_sz = lseek(handle->indexfd, 0, SEEK_END);
lseek(handle->indexfd, 0, SEEK_SET);
memcpy(buffer, datafp, datafp_sz);
buffer[datafp_sz] = '\0';
handle->datafd = open(buffer, O_RDONLY);
if (handle->datafd < 0) {
goto error_cleanup;
}
handle->data_file_sz = lseek(handle->datafd, 0, SEEK_END);
lseek(handle->datafd, 0, SEEK_SET);
handle->index_mmap = mmap(NULL, (size_t) handle->index_file_sz, PROT_READ,
MAP_FILE | MAP_PRIVATE, handle->indexfd, (off_t) 0);
if (handle->index_mmap == MAP_FAILED) {
int er = errno;
(void) er;
goto error_cleanup;
}
handle->data_mmap = mmap(NULL, (size_t) handle->data_file_sz, PROT_READ,
MAP_FILE | MAP_PRIVATE, handle->datafd, (off_t) 0);
if (handle->data_mmap == MAP_FAILED) {
goto error_cleanup;
}
// read the headers and ensure that the file length is at least somewhat
// sane.
if (handle->index_file_sz < sizeof(disk_index_header_t)) {
goto error_cleanup;
}
const disk_index_header_t *header = (const disk_index_header_t *)
handle->index_mmap;
if (header->version != VERSION) {
goto error_cleanup;
}
handle->large_fanout = ((header->config | LARGE_FANOUT) != 0);
int fanout_count = 1 << (handle->large_fanout ? 16 : 8);
handle->fanout_table = (fanout_table_entry_t *) calloc(
fanout_count, sizeof(fanout_table_entry_t));
if (handle->fanout_table == NULL) {
goto error_cleanup;
}
handle->index_table = (disk_index_entry_t *)
(((const char *) handle->index_mmap) +
sizeof(disk_index_header_t) +
(sizeof(index_offset_t) * fanout_count));
disk_index_entry_t *index_end = (disk_index_entry_t *)
(((const char *) handle->index_mmap) + handle->index_file_sz);
if (handle->index_table > index_end) {
// ensure the file is at least big enough to include the fanout table.
goto error_cleanup;
}
// build a clean and easy table to bisect.
index_offset_t *index = (index_offset_t *)
(((const char *) handle->index_mmap) +
sizeof(disk_index_header_t));
index_offset_t prev_index_offset = 0;
int last_fanout_increment = 0;
for (int ix = 0; ix < fanout_count; ix++) {
index_offset_t index_offset = ntoh_index_offset(index[ix]);
if (index_offset != prev_index_offset) {
// backfill the start & end offsets
for (int jx = last_fanout_increment; jx < ix; jx ++) {
// fill the "start" except for the last time we changed the index
// offset.
if (jx != last_fanout_increment) {
handle->fanout_table[jx].start_index = index_offset;
}
handle->fanout_table[jx].end_index = index_offset;
}
handle->fanout_table[ix].start_index = index_offset;
last_fanout_increment = ix;
prev_index_offset = index_offset;
}
}
// we may need to backfill the remaining offsets.
index_offset_t last_offset = (index_offset_t)
(index_end - handle->index_table - 1);
for (int jx = last_fanout_increment; jx < fanout_count; jx ++) {
// fill the "start" except for the last time we changed the index
// offset.
if (jx != last_fanout_increment) {
handle->fanout_table[jx].start_index = last_offset;
}
handle->fanout_table[jx].end_index = last_offset;
}
goto success_cleanup;
error_cleanup:
if (handle->index_mmap != MAP_FAILED) {
munmap(handle->index_mmap, handle->index_file_sz);
}
if (handle->data_mmap != MAP_FAILED) {
munmap(handle->data_mmap, handle->data_file_sz);
}
if (handle && handle->indexfd != 0) {
close(handle->indexfd);
}
if (handle && handle->datafd != 0) {
close(handle->datafd);
}
free(handle->index_table);
free(handle);
handle = NULL;
success_cleanup:
free(buffer);
return handle;
}
/**
 * Release a handle returned by open_datapack(): unmap both files, close
 * both descriptors, and free the handle's heap memory.
 *
 * @param handle the handle to release; NULL is accepted and ignored. The
 *               handle, and every pointer into its mappings, is invalid
 *               after this call.
 */
void close_datapack(datapack_handle_t *handle) {
  if (handle == NULL) {
    return;
  }

  munmap(handle->index_mmap, (size_t) handle->index_file_sz);
  munmap(handle->data_mmap, (size_t) handle->data_file_sz);
  close(handle->indexfd);
  close(handle->datafd);

  // index_table points into the (now unmapped) index mapping and was never
  // heap-allocated; the calloc'ed fanout table is what has to be freed.
  free(handle->fanout_table);
  free(handle);
}
// Growth policy for pack_chain_t's array of links.
//
// initial capacity, in entries, of a freshly built pack chain.
#define DEFAULT_PACK_CHAIN_CAPACITY 64
// passed to expand_to_fit() as its `factor` argument.
#define PACK_CHAIN_GROWTH_FACTOR 2.0
// passed to expand_to_fit() as its `min_increment` argument.
#define PACK_CHAIN_MINIMUM_GROWTH 1024
// passed to expand_to_fit() as its `max_increment` argument.
#define PACK_CHAIN_MAXIMUM_GROWTH 65536
// Make room for ONE more pack_index_entry_t. `buffer` is a void ** to the
// array pointer, `buffer_idx` the number of slots in use, and `buffer_sz` a
// pointer to the capacity. Expands to a call to expand_to_fit(), so it
// yields that function's bool result.
#define PACK_CHAIN_EXPAND_TO_FIT(buffer, buffer_idx, buffer_sz) \
expand_to_fit(buffer, buffer_idx, buffer_sz, \
1, sizeof(pack_index_entry_t), \
PACK_CHAIN_GROWTH_FACTOR, \
PACK_CHAIN_MINIMUM_GROWTH, \
PACK_CHAIN_MAXIMUM_GROWTH)
static pack_chain_t *build_pack_chain(
const datapack_handle_t *handle,
uint8_t node[NODE_SZ]) {
pack_chain_t *result = malloc(sizeof(pack_chain_t));
result->links_idx = 0;
result->links_sz = DEFAULT_PACK_CHAIN_CAPACITY;
result->pack_chain_links = malloc(
result->links_sz * sizeof(pack_index_entry_t));
// TODO: error handling.
pack_index_entry_t entry;
// find the first entry.
if (find(handle, node, &entry) == false) {
return NULL;
}
PACK_CHAIN_EXPAND_TO_FIT(
(void **)&result->pack_chain_links,
result->links_idx,
&result->links_sz);
// TODO: yeah, this desperately needs some error handling.
result->pack_chain_links[result->links_idx] = entry;
while (entry.deltabase_index_offset != FULLTEXTINDEXMARK &&
entry.deltabase_index_offset != NOBASEINDEXMARK) {
unpack_disk_deltachunk(
&handle->index_table[entry.deltabase_index_offset], &entry);
PACK_CHAIN_EXPAND_TO_FIT(
(void **)&result->pack_chain_links,
result->links_idx,
&result->links_sz);
// TODO: yeah, this desperately needs some error handling.
result->pack_chain_links[result->links_idx] = entry;
}
return result;
}
/**
 * Decode one delta chain link from the data file.
 *
 * Layout read from `ptr`, in order: a 16-bit big-endian filename length,
 * the filename bytes, a NODE_SZ-byte node, a NODE_SZ-byte delta base node,
 * a 64-bit big-endian delta length, and the delta bytes.
 *
 * @param ptr  start of the link. No bounds are checked here; the caller must
 *             validate the returned pointer against the end of its buffer.
 * @param link filled in; its pointers refer into the buffer `ptr` came from
 *             and the filename is NOT NUL-terminated.
 * @return pointer to the first byte after the delta.
 */
const uint8_t *getdeltachainlink(
    const uint8_t *ptr, delta_chain_link_t *link) {
  // the integer fields are assembled byte by byte: `ptr` is an arbitrary
  // byte offset, so reading it through uint16_t/uint64_t pointers would be
  // a misaligned access.
  link->filename_sz = (uint16_t) ((ptr[0] << 8) | ptr[1]);
  ptr += sizeof(uint16_t);

  link->filename = (const char *) ptr;
  ptr += link->filename_sz;

  link->node = ptr;
  ptr += NODE_SZ;

  link->deltabase_node = ptr;
  ptr += NODE_SZ;

  uint64_t delta_sz = 0;
  for (size_t ix = 0; ix < sizeof(uint64_t); ix ++) {
    delta_sz = (delta_sz << 8) | ptr[ix];
  }
  link->delta_sz = delta_sz;
  ptr += sizeof(uint64_t);

  link->delta = ptr;
  ptr += link->delta_sz;

  return ptr;
}
delta_chain_t *getdeltachain(
const datapack_handle_t *handle,
uint8_t node[NODE_SZ]) {
pack_chain_t *pack_chain = build_pack_chain(handle, node);
// TODO: error handling
delta_chain_t *delta_chain = malloc(sizeof(delta_chain_t));
delta_chain->links_count = pack_chain->links_idx;
delta_chain->delta_chain_links = malloc(
delta_chain->links_count * sizeof(delta_chain_link_t));
// TODO: error handling
for (int ix = 0; ix < pack_chain->links_sz; ix ++) {
const uint8_t *ptr = (const uint8_t *)
pack_chain->pack_chain_links[ix].data_offset;
const uint8_t *end = ptr +
pack_chain->pack_chain_links[ix].data_sz;
delta_chain_link_t *link = &delta_chain->delta_chain_links[ix];
ptr = getdeltachainlink(ptr, link);
if (ptr > end) {
abort();
}
}
// free pack chain.
if (pack_chain != NULL) {
free(pack_chain->pack_chain_links);
free(pack_chain);
}
return delta_chain;
}

View File

@ -0,0 +1,94 @@
// Copyright 2016-present Facebook. All Rights Reserved.
//
// cdatapack.h: public types and functions for reading datapack files and
// their indexes.
//
// no-check-code
#ifndef CDATAPACK_CDATAPACK_H
#define CDATAPACK_CDATAPACK_H
#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>
// size in bytes of a node (hash) as stored in the index and data files.
#define NODE_SZ 20
// an offset within the index; stored on disk as a 32-bit value and converted
// with ntoh_index_offset.
// NOTE: this header does not include the header that declares ntohl/ntohll;
// code that uses the ntoh_* macros must bring those into scope itself.
typedef uint32_t index_offset_t;
#define ntoh_index_offset ntohl
// special deltabase_index_offset values; either one ends a delta chain walk.
#define FULLTEXTINDEXMARK ((index_offset_t) -1)
#define NOBASEINDEXMARK ((index_offset_t) -2)
// an offset or size within the data file; stored on disk as a 64-bit value
// and converted with ntoh_data_offset.
typedef uint64_t data_offset_t;
#define ntoh_data_offset ntohll
// defined privately in cdatapack.c; only pointers to them appear here.
struct _disk_index_entry_t;
struct _fanout_table_entry_t;
/**
 * State for one open datapack + index pair, created by open_datapack() and
 * released by close_datapack().
 *
 * NOTE: off_t is not declared by the headers included above; includers must
 * include a header that defines it (e.g. <sys/types.h>) first.
 */
typedef struct _datapack_handle_t {
// file descriptors of the index and data files.
int indexfd;
int datafd;
// read-only mappings of the whole index and data files.
void* index_mmap;
void* data_mmap;
// sizes of the two files (and therefore of the two mappings) in bytes.
off_t index_file_sz;
off_t data_file_sz;
// selects between a 2^16-entry (true) and a 2^8-entry (false) fanout table.
bool large_fanout;
// this is the computed fanout table.
struct _fanout_table_entry_t *fanout_table;
// this points to the first index entry.
struct _disk_index_entry_t* index_table;
// this points to the entry one past the last.
struct _disk_index_entry_t* index_end;
} datapack_handle_t;
/**
 * This represents a single entry in a delta chain, as decoded by
 * getdeltachainlink(). Every pointer refers into the buffer that was
 * decoded; nothing is copied.
 */
typedef struct _delta_chain_link_t {
// length of `filename` in bytes.
uint16_t filename_sz;
// filename bytes; NOT NUL-terminated, use filename_sz.
const char *filename;
// NODE_SZ bytes: the node of this entry.
const uint8_t *node;
// NODE_SZ bytes: the node this entry is a delta against.
const uint8_t *deltabase_node;
// length of `delta` in bytes.
data_offset_t delta_sz;
// the delta payload.
const uint8_t *delta;
} delta_chain_link_t;
/**
 * This represents an entire delta chain, as returned by getdeltachain().
 */
typedef struct _delta_chain_t {
// heap-allocated array of links.
delta_chain_link_t *delta_chain_links;
// number of links recorded for the chain.
size_t links_count;
} delta_chain_t;
/**
 * Open a datapack + index file. The fanout table is read and processed at
 * this point.
 *
 * Each path is given as a pointer plus a length and does not need to be
 * NUL-terminated.
 *
 * Returns a handle for subsequent operations, or NULL if the files cannot be
 * opened, mapped, or validated.
 */
extern datapack_handle_t *open_datapack(
char *indexfp, size_t indexfp_sz,
char *datafp, size_t datafp_sz);
/**
 * Release a datapack + index file handle: unmaps both files, closes both
 * descriptors and frees the handle. The handle must not be used afterwards.
 */
extern void close_datapack(datapack_handle_t *);
/**
 * Retrieves a delta chain for a given node.
 *
 * The returned chain and its delta_chain_links array are heap-allocated; no
 * release function is declared in this header.
 */
extern delta_chain_t *getdeltachain(
const datapack_handle_t *handle,
uint8_t node[NODE_SZ]);
/**
 * Decode the delta chain link that starts at `ptr` into `link` and return a
 * pointer to the first byte after it. No bounds checking is performed.
 *
 * this should really be private, but we need it for the cdatapack_dump tool.
 */
extern const uint8_t *getdeltachainlink(
const uint8_t *ptr, delta_chain_link_t *link);
#endif //CDATAPACK_CDATAPACK_H

View File

@ -0,0 +1,69 @@
// Copyright 2016-present Facebook. All Rights Reserved.
//
// cdatapack_dump.c: Dump the entire contents of a datapack file by walking
// the datapack file.
//
// no-check-code
#include <memory.h>
#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>
#include "convert.h"
#include "cdatapack.h"
// filename extensions appended to the <path> argument.
#define DATAIDX_EXT ".dataidx"
#define DATAPACK_EXT ".datapack"

/**
 * Dump every entry of a datapack by walking the data file from start to
 * end.
 *
 * Usage: cdatapack_dump <path>, where <path> is the pack's path without the
 * .dataidx / .datapack extension. Exits with status 1 on a usage error or
 * if the pack cannot be opened, and 0 otherwise.
 */
int main(int argc, char *argv[]) {
  if (argc < 2) {
    fprintf(stderr, "%s <path>\n", argv[0]);
    exit(1);
  }

  // sizeof(EXT) counts the terminating NUL, so each buffer is exactly big
  // enough for path + extension + NUL.
  size_t len = strlen(argv[1]);
  char idx_path[len + sizeof(DATAIDX_EXT)];
  char data_path[len + sizeof(DATAPACK_EXT)];

  snprintf(idx_path, sizeof(idx_path), "%s%s", argv[1], DATAIDX_EXT);
  snprintf(data_path, sizeof(data_path), "%s%s", argv[1], DATAPACK_EXT);

  datapack_handle_t *handle = open_datapack(
      idx_path, strlen(idx_path),
      data_path, strlen(data_path));
  if (handle == NULL) {
    fprintf(stderr, "unable to open datapack %s\n", argv[1]);
    return 1;
  }

  const uint8_t *ptr = handle->data_mmap;
  const uint8_t *end = ptr + handle->data_file_sz;
  ptr += 1; // for the version field.

  const char *last_filename = NULL;
  uint16_t last_filename_sz = 0;

  // hex output is printed with an explicit precision, so these buffers are
  // intentionally not NUL-terminated.
  char node_buffer[NODE_SZ * 2];
  char deltabase_buffer[NODE_SZ * 2];

  while (ptr < end) {
    delta_chain_link_t link;
    ptr = getdeltachainlink(ptr, &link);

    // memcmp is skipped for zero-length names, which also keeps the initial
    // NULL last_filename from ever being passed to it.
    if (last_filename_sz != link.filename_sz ||
        (last_filename_sz != 0 &&
         memcmp(last_filename, link.filename, last_filename_sz) != 0)) {
      // print the filename
      printf("\n%-.*s\n", (int) link.filename_sz, link.filename);
      last_filename_sz = link.filename_sz;
      last_filename = link.filename;
    }

    hexlify(link.node, NODE_SZ, node_buffer);
    hexlify(link.deltabase_node, NODE_SZ, deltabase_buffer);

    printf("%-*s %-*s %s\n",
        NODE_SZ * 2, "Node", NODE_SZ * 2, "Delta Base",
        "Delta Length");
    printf("%-.*s %-.*s %" PRIu64 "\n",
        NODE_SZ * 2, node_buffer, NODE_SZ * 2, deltabase_buffer,
        link.delta_sz);
  }

  close_datapack(handle);
  return 0;
}

View File

@ -0,0 +1,8 @@
//
// cdatapack_get: placeholder executable; not yet implemented.
//
// no-check-code
/* Placeholder entry point: does nothing and reports success. */
int main(void) {
  return 0;
}

View File

@ -0,0 +1,75 @@
// Copyright 2016-present Facebook. All Rights Reserved.
//
// convert.h: hex-string conversions
//
// no-check-code
#ifndef __FASTMANIFEST_CONVERT_H__
#define __FASTMANIFEST_CONVERT_H__
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
/*
 * Maps a character code to the value of the hex digit it represents, or -1
 * if the character is not a hex digit. Both upper- and lower-case digits
 * are accepted. Read-only: declared const so that no includer can modify
 * the table.
 */
static const int8_t hextable[256] = {
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, /* 0-9 */
  -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* A-F */
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* a-f */
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};
/*
 * Maps a value 0-15 to its lower-case hex digit. Read-only: declared const
 * so that no includer can modify the table.
 */
static const char chartable[16] = {
  '0', '1', '2', '3', '4', '5', '6', '7',
  '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
};
/*
 * Turn a hex-encoded string into binary.
 *
 * `input` holds `len` hex digits (no NUL terminator is required) and `dst`
 * must have room for len / 2 bytes. Returns false if `len` is negative or
 * odd, or if a non-hex character is encountered; in the latter case the
 * bytes decoded before the bad character have already been written to
 * `dst`.
 */
static inline bool unhexlify(const char *input, int len, uint8_t *dst) {
  // a negative length must be rejected explicitly: converted to an unsigned
  // loop bound it would become enormous and overrun `input`.
  if (len < 0 || len % 2 != 0) {
    return false;
  }

  for (int ix = 0; ix < len; ix += 2, dst++) {
    // go through unsigned char so bytes >= 0x80 cannot index the table
    // negatively.
    int hi = hextable[(unsigned char) input[ix]];
    int lo = hextable[(unsigned char) input[ix + 1]];
    if (hi < 0 || lo < 0) {
      return false;
    }
    *dst = (uint8_t) ((hi << 4) | lo);
  }

  return true;
}
/*
 * Turn binary data into a hex-encoded string.
 *
 * Writes exactly 2 * len lower-case hex digits to `dst`, which must be at
 * least that large. The output is NOT NUL-terminated. Nothing is written
 * when `len` is zero or negative.
 */
static inline void hexlify(const uint8_t *input, int len, char *dst) {
  // the index has the same signed type as `len`, so a negative length runs
  // zero iterations instead of being converted to a huge unsigned bound.
  for (int ix = 0; ix < len; ix++) {
    const uint8_t byte = input[ix];
    *dst++ = chartable[byte >> 4];
    *dst++ = chartable[byte & 0xf];
  }
}
#endif /* #ifndef __FASTMANIFEST_CONVERT_H__ */

View File

@ -0,0 +1,9 @@
// Copyright 2016-present Facebook. All Rights Reserved.
//
// null_test.c: garbage test to make CLion happy.
//
// no-check-code
/* Intentionally empty program: ignores its arguments and reports success. */
int main(int argc, char *argv[]) {
  (void) argc;
  (void) argv;
  return 0;
}