unleashed-firmware/lib/mjs/mjs_string.c

602 lines
17 KiB
C
Raw Normal View History

/*
* Copyright (c) 2017 Cesanta Software Limited
* All rights reserved
*/
#include "mjs_string.h"
#include "common/cs_varint.h"
#include "common/mg_str.h"
#include "mjs_core.h"
#include "mjs_internal.h"
#include "mjs_primitive.h"
#include "mjs_util.h"
/*
 * Byte-oriented "rune" shim: no UTF decoding is performed, every rune is
 * exactly one byte of the input.
 */
typedef unsigned short Rune;

/*
 * Reads one byte from `str`, stores its unsigned value in `*rune` and returns
 * the number of bytes consumed, which is always 1.
 */
static int chartorune(Rune* rune, const char* str) {
    const unsigned char byte = *(const unsigned char*)str;

    *rune = byte;
    return 1;
}

/*
 * Stores `*rune`, converted to `char`, in `str[0]` and returns the number of
 * bytes produced, which is always 1.
 */
static int runetochar(char* str, Rune* rune) {
    const Rune value = *rune;

    str[0] = (char)value;
    return 1;
}
/*
 * Number of extra bytes requested on top of the required size when
 * mjs_mk_string() has to grow the owned-strings buffer itself. May be
 * overridden by defining it before this point.
 */
#ifndef MJS_STRING_BUF_RESERVE
#define MJS_STRING_BUF_RESERVE 100
#endif
/* Forward declarations of helpers that are defined further down in this file. */
MJS_PRIVATE size_t unescape(const char* s, size_t len, char* to);
MJS_PRIVATE void embed_string(
struct mbuf* m,
size_t offset,
const char* p,
size_t len,
uint8_t /*enum embstr_flags*/ flags);
/* TODO(lsm): NaN payload location depends on endianness, make crossplatform */
/*
 * Yields a `char*` to the first byte of the in-memory representation of `v`.
 * The address of `v` is taken, so `v` must be an lvalue.
 */
#define GET_VAL_NAN_PAYLOAD(v) ((char*)&(v))
/*
 * Returns 1 when the tag bits of `v` match any of the five string tags
 * (MJS_TAG_STRING_I, _F, _O, _5 or _D) and 0 otherwise.
 */
int mjs_is_string(mjs_val_t v) {
    const uint64_t tag = v & MJS_TAG_MASK;

    if(tag == MJS_TAG_STRING_I) return 1;
    if(tag == MJS_TAG_STRING_F) return 1;
    if(tag == MJS_TAG_STRING_O) return 1;
    if(tag == MJS_TAG_STRING_5) return 1;
    if(tag == MJS_TAG_STRING_D) return 1;
    return 0;
}
mjs_val_t mjs_mk_string(struct mjs* mjs, const char* p, size_t len, int copy) {
struct mbuf* m;
mjs_val_t offset, tag = MJS_TAG_STRING_F;
if(len == 0) {
/*
* Zero length for foreign string has a special meaning (that the foreign
* string is not inlined into mjs_val_t), so when creating a zero-length
* string, we always assume it'll be owned. Since the length is zero, it
* doesn't matter anyway.
*/
copy = 1;
}
m = copy ? &mjs->owned_strings : &mjs->foreign_strings;
offset = m->len;
if(len == ~((size_t)0)) len = strlen(p);
if(copy) {
/* owned string */
if(len <= 4) {
char* s = GET_VAL_NAN_PAYLOAD(offset) + 1;
offset = 0;
if(p != 0) {
memcpy(s, p, len);
}
s[-1] = len;
tag = MJS_TAG_STRING_I;
} else if(len == 5) {
char* s = GET_VAL_NAN_PAYLOAD(offset);
offset = 0;
if(p != 0) {
memcpy(s, p, len);
}
tag = MJS_TAG_STRING_5;
// } else if ((dict_index = v_find_string_in_dictionary(p, len)) >= 0) {
// offset = 0;
// GET_VAL_NAN_PAYLOAD(offset)[0] = dict_index;
// tag = MJS_TAG_STRING_D;
} else {
if(gc_strings_is_gc_needed(mjs)) {
mjs->need_gc = 1;
}
/*
* Before embedding new string, check if the reallocation is needed. If
* so, perform the reallocation by calling `mbuf_resize` manually, since
* we need to preallocate some extra space (`MJS_STRING_BUF_RESERVE`)
*/
if((m->len + len) > m->size) {
char* prev_buf = m->buf;
mbuf_resize(m, m->len + len + MJS_STRING_BUF_RESERVE);
/*
* There is a corner case: when the source pointer is located within
* the mbuf. In this case, we should adjust the pointer, because it
* might have just been reallocated.
*/
if(p >= prev_buf && p < (prev_buf + m->len)) {
p += (m->buf - prev_buf);
}
}
embed_string(m, m->len, p, len, EMBSTR_ZERO_TERM);
tag = MJS_TAG_STRING_O;
}
} else {
/* foreign string */
if(sizeof(void*) <= 4 && len <= (1 << 15)) {
/* small foreign strings can fit length and ptr in the mjs_val_t */
offset = (uint64_t)len << 32 | (uint64_t)(uintptr_t)p;
} else {
/* bigger strings need indirection that uses ram */
size_t pos = m->len;
size_t llen = cs_varint_llen(len);
/* allocate space for len and ptr */
mbuf_insert(m, pos, NULL, llen + sizeof(p));
cs_varint_encode(len, (uint8_t*)(m->buf + pos), llen);
memcpy(m->buf + pos + llen, &p, sizeof(p));
}
tag = MJS_TAG_STRING_F;
}
/* NOTE(lsm): don't use pointer_to_value, 32-bit ptrs will truncate */
return (offset & ~MJS_TAG_MASK) | tag;
}
/*
 * Resolves a string value to a pointer to its bytes and to its length.
 *
 * `v` is passed by pointer because short strings live inside the value
 * itself: for MJS_TAG_STRING_I and MJS_TAG_STRING_5 the returned pointer
 * refers to the storage of `*v`. For owned strings it points into
 * `mjs->owned_strings`, for foreign strings it is the recorded pointer.
 *
 * The length is stored in `*sizep` unless `sizep` is NULL. When `*v` is not a
 * string, or its buffer entry cannot be decoded, NULL is returned and the
 * reported length is 0.
 */
const char* mjs_get_string(struct mjs* mjs, mjs_val_t* v, size_t* sizep) {
    const uint64_t tag = v[0] & MJS_TAG_MASK;
    const char* str = NULL;
    size_t str_len = 0;
    size_t llen;

    if(mjs_is_string(*v)) {
        if(tag == MJS_TAG_STRING_I) {
            /* first payload byte is the length, the characters follow it */
            const char* payload = GET_VAL_NAN_PAYLOAD(*v);
            str = payload + 1;
            str_len = payload[0];
        } else if(tag == MJS_TAG_STRING_5) {
            /* the payload holds exactly five characters */
            str = GET_VAL_NAN_PAYLOAD(*v);
            str_len = 5;
        } else if(tag == MJS_TAG_STRING_O) {
            /* varint length followed by the characters, in owned_strings */
            const size_t offset = (size_t)gc_string_mjs_val_to_offset(*v);
            const struct mbuf* owned = &mjs->owned_strings;
            char* entry = owned->buf + offset;
            uint64_t decoded = 0;

            if(offset < owned->len &&
               cs_varint_decode((uint8_t*)entry, owned->len - offset, &decoded, &llen)) {
                str_len = decoded;
                str = entry + llen;
            }
        } else if(tag == MJS_TAG_STRING_F) {
            /*
             * On machines with pointers of at most 32 bits, a foreign string
             * may be stored in a compact form with a 16-bit length next to
             * the pointer:
             *
             *        7         6        5        4        3        2        1        0
             *  11111111|1111tttt|llllllll|llllllll|ssssssss|ssssssss|ssssssss|ssssssss
             *
             * No separate tag distinguishes the two foreign encodings. A zero
             * length field means the value holds an offset into the
             * foreign_strings mbuf instead, where a varint length and the
             * pointer are stored.
             */
            const uint16_t inline_len = (*v >> 32) & 0xFFFF;

            if(sizeof(void*) <= 4 && inline_len != 0) {
                str_len = (size_t)inline_len;
                str = (const char*)(uintptr_t)*v;
            } else {
                const size_t offset = (size_t)gc_string_mjs_val_to_offset(*v);
                const struct mbuf* foreign = &mjs->foreign_strings;
                char* entry = foreign->buf + offset;
                uint64_t decoded = 0;

                if(offset < foreign->len &&
                   cs_varint_decode((uint8_t*)entry, foreign->len - offset, &decoded, &llen)) {
                    str_len = decoded;
                    memcpy(&str, entry + llen, sizeof(str));
                }
            }
        } else {
            /* a string tag without a decoder here (MJS_TAG_STRING_D) */
            assert(0);
        }
    }

    if(sizep != NULL) {
        *sizep = str_len;
    }
    return str;
}
/*
 * Returns the bytes of the string `*value` as a C string, or NULL when the
 * value does not resolve to a string, is not followed by a zero byte, or
 * contains an embedded zero byte (strlen() differs from the stored length).
 */
const char* mjs_get_cstring(struct mjs* mjs, mjs_val_t* value) {
    size_t stored_len;
    const char* str = mjs_get_string(mjs, value, &stored_len);

    if(str != NULL && str[stored_len] == 0 && strlen(str) == stored_len) {
        return str;
    }
    return NULL;
}
/*
 * Compares the string value `*a` with the `len` bytes at `b`.
 *
 * `len` may be (size_t)~0, in which case strlen(b) is used. Strings of
 * different length never compare equal: the result is -1 when `*a` is
 * shorter and 1 when it is longer. Strings of equal length are compared with
 * strncmp(). A value that does not resolve to a string is reported with
 * length 0 by mjs_get_string(), so it compares equal to an empty `b`.
 *
 * Returns 0 when equal, a negative or positive value otherwise.
 */
int mjs_strcmp(struct mjs* mjs, mjs_val_t* a, const char* b, size_t len) {
    size_t n = 0;
    const char* s;

    if(len == (size_t)~0) len = strlen(b);

    s = mjs_get_string(mjs, a, &n);
    if(n != len) {
        /*
         * Do not return `n - len`: the size_t difference does not fit an int
         * in general and could be narrowed to 0 or to the wrong sign.
         */
        return (n < len) ? -1 : 1;
    }
    if(len == 0) {
        /* nothing to compare; also avoids handing a NULL `s` to strncmp() */
        return 0;
    }
    return strncmp(s, b, len);
}
/*
 * Parses the `len` bytes at `s` as an unsigned decimal number.
 *
 * `*ok` is set to 1 only when all `len` bytes are decimal digits, `len` is
 * not zero and the value fits an unsigned long. Signs, whitespace and any
 * other non-digit byte make the conversion fail. Exactly `len` bytes are
 * examined, so `s` does not need to be zero-terminated.
 *
 * Returns the parsed value. On failure (`*ok` == 0) it returns the value of
 * the digits read before the offending byte, the largest unsigned long on
 * overflow, or 0 when `s` is NULL or `len` is 0.
 */
MJS_PRIVATE unsigned long cstr_to_ulong(const char* s, size_t len, int* ok) {
    const unsigned long max = ~0UL;
    unsigned long res = 0;
    size_t i;

    *ok = 0;
    if(s == NULL || len == 0) {
        return 0;
    }

    for(i = 0; i < len; i++) {
        unsigned long digit;

        if(s[i] < '0' || s[i] > '9') {
            return res;
        }
        digit = (unsigned long)(s[i] - '0');
        /* res * 10 + digit would exceed the range of unsigned long */
        if(res > (max - digit) / 10) {
            return max;
        }
        res = res * 10 + digit;
    }

    *ok = 1;
    return res;
}
/*
 * Converts the string value `v` to an unsigned long with cstr_to_ulong().
 *
 * `*res` receives the converted number and `*ok` tells whether the whole
 * string was a valid number. When `v` does not resolve to a string, `*ok` is
 * set to 0 and `*res` to 0 instead of handing a NULL pointer to the parser.
 *
 * Always returns MJS_OK.
 */
MJS_PRIVATE mjs_err_t str_to_ulong(struct mjs* mjs, mjs_val_t v, int* ok, unsigned long* res) {
    size_t len = 0;
    /* for short strings `p` points into the local copy `v`, which outlives the call below */
    const char* p = mjs_get_string(mjs, &v, &len);

    if(p == NULL) {
        *ok = 0;
        *res = 0;
        return MJS_OK;
    }

    *res = cstr_to_ulong(p, len, ok);
    return MJS_OK;
}
/*
 * Compares two string values.
 *
 * Strings of different length are ordered by length alone (1 when `a` is
 * longer, -1 when it is shorter); only strings of equal length are compared
 * byte by byte with memcmp().
 * NOTE(review): this is not a lexicographic order - confirm that callers
 * using the result for ordering expect that.
 */
MJS_PRIVATE int s_cmp(struct mjs* mjs, mjs_val_t a, mjs_val_t b) {
    size_t lhs_len, rhs_len;
    const char* lhs = mjs_get_string(mjs, &a, &lhs_len);
    const char* rhs = mjs_get_string(mjs, &b, &rhs_len);

    if(lhs_len != rhs_len) {
        return (lhs_len > rhs_len) ? 1 : -1;
    }
    return memcmp(lhs, rhs, lhs_len);
}
/*
 * Returns a new owned string holding the bytes of `a` followed by the bytes
 * of `b`.
 */
MJS_PRIVATE mjs_val_t s_concat(struct mjs* mjs, mjs_val_t a, mjs_val_t b) {
    size_t lhs_len, rhs_len, out_len;
    const char* lhs;
    const char* rhs;
    char* out;
    mjs_val_t result;

    /* First pass: only the two lengths are needed to size the result */
    lhs = mjs_get_string(mjs, &a, &lhs_len);
    rhs = mjs_get_string(mjs, &b, &rhs_len);

    /* Reserve room for the result; NULL means "do not copy anything yet" */
    result = mjs_mk_string(mjs, NULL, lhs_len + rhs_len, 1);

    /*
     * mjs_mk_string() may have reallocated the string buffer, so the source
     * pointers have to be fetched again before they are dereferenced
     */
    lhs = mjs_get_string(mjs, &a, &lhs_len);
    rhs = mjs_get_string(mjs, &b, &rhs_len);

    /* Fill the reserved space with both halves */
    out = (char*)mjs_get_string(mjs, &result, &out_len);
    memcpy(out, lhs, lhs_len);
    memcpy(out + lhs_len, rhs, rhs_len);

    return result;
}
/*
 * Script-callable `slice(beginSlice[, endSlice])` on a string `this`.
 *
 * Returns (via mjs_return) a copy of the bytes between the two indices, both
 * passed through mjs_normalize_idx(). Without a second argument the slice
 * runs to the end of the string; an end before the begin yields an empty
 * string. If an argument check fails, the number 0 is returned instead.
 */
MJS_PRIVATE void mjs_string_slice(struct mjs* mjs) {
    const int argc = mjs_nargs(mjs);
    mjs_val_t result = mjs_mk_number(mjs, 0);
    mjs_val_t begin_v = MJS_UNDEFINED;
    mjs_val_t end_v = MJS_UNDEFINED;
    int begin = 0;
    int end = 0;
    size_t length;
    const char* text = NULL;

    /* `this` has to be a string */
    if(!mjs_check_arg(mjs, -1 /*this*/, "this", MJS_TYPE_STRING, NULL)) {
        goto done;
    }
    text = mjs_get_string(mjs, &mjs->vals.this_obj, &length);

    /* argument 0: where the slice starts */
    if(!mjs_check_arg(mjs, 0, "beginSlice", MJS_TYPE_NUMBER, &begin_v)) {
        goto done;
    }
    begin = mjs_normalize_idx(mjs_get_int(mjs, begin_v), length);

    if(argc < 2) {
        /* no end given: slice up to the end of the string */
        end = length;
    } else {
        /* argument 1: where the slice stops */
        if(!mjs_check_arg(mjs, 1, "endSlice", MJS_TYPE_NUMBER, &end_v)) {
            goto done;
        }
        end = mjs_normalize_idx(mjs_get_int(mjs, end_v), length);
    }

    /* an end before the begin means an empty slice */
    if(end < begin) {
        end = begin;
    }

    result = mjs_mk_string(mjs, text + begin, end - begin, 1);

done:
    mjs_return(mjs, result);
}
MJS_PRIVATE void mjs_string_index_of(struct mjs* mjs) {
mjs_val_t ret = mjs_mk_number(mjs, -1);
mjs_val_t substr_v = MJS_UNDEFINED;
mjs_val_t idx_v = MJS_UNDEFINED;
int idx = 0;
const char *str = NULL, *substr = NULL;
size_t str_len = 0, substr_len = 0;
if(!mjs_check_arg(mjs, -1 /* this */, "this", MJS_TYPE_STRING, NULL)) {
goto clean;
}
str = mjs_get_string(mjs, &mjs->vals.this_obj, &str_len);
if(!mjs_check_arg(mjs, 0, "searchValue", MJS_TYPE_STRING, &substr_v)) {
goto clean;
}
substr = mjs_get_string(mjs, &substr_v, &substr_len);
if(mjs_nargs(mjs) > 1) {
if(!mjs_check_arg(mjs, 1, "fromIndex", MJS_TYPE_NUMBER, &idx_v)) {
goto clean;
}
idx = mjs_get_int(mjs, idx_v);
if(idx < 0) idx = 0;
if((size_t)idx > str_len) idx = str_len;
}
{
const char* substr_p;
struct mg_str mgstr, mgsubstr;
mgstr.p = str + idx;
mgstr.len = str_len - idx;
mgsubstr.p = substr;
mgsubstr.len = substr_len;
substr_p = mg_strstr(mgstr, mgsubstr);
if(substr_p != NULL) {
ret = mjs_mk_number(mjs, (int)(substr_p - str));
}
}
clean:
mjs_return(mjs, ret);
}
/*
 * Script-callable `charCodeAt(index)` on a string `this`.
 *
 * Returns (via mjs_return) the unsigned value of the byte at `index` after
 * the index went through mjs_normalize_idx(). The result is MJS_UNDEFINED
 * when an argument check fails or the index is outside the string.
 */
MJS_PRIVATE void mjs_string_char_code_at(struct mjs* mjs) {
    mjs_val_t result = MJS_UNDEFINED;
    mjs_val_t index_v = MJS_UNDEFINED;
    size_t length;
    const char* text = NULL;

    /* `this` has to be a string ... */
    if(mjs_check_arg(mjs, -1 /*this*/, "this", MJS_TYPE_STRING, NULL)) {
        text = mjs_get_string(mjs, &mjs->vals.this_obj, &length);

        /* ... and argument 0 a number */
        if(mjs_check_arg(mjs, 0, "index", MJS_TYPE_NUMBER, &index_v)) {
            const int pos = mjs_normalize_idx(mjs_get_int(mjs, index_v), length);

            if(pos >= 0 && pos < (int)length) {
                result = mjs_mk_number(mjs, ((const unsigned char*)text)[pos]);
            }
        }
    }

    mjs_return(mjs, result);
}
/*
 * Script-callable `mkstr`: builds a string from a foreign pointer.
 *
 * Accepted forms are (ptr, len), (ptr, offset, len) and
 * (ptr, offset, len, copy). The string covers `len` bytes starting at
 * `ptr + offset`; it is copied when `copy` is truthy and created as a
 * foreign string otherwise. On any argument error an error message is
 * prepended and MJS_UNDEFINED is returned.
 *
 * NOTE(review): `len` is read as an int and handed to mjs_mk_string(), which
 * takes a size_t; a negative value is therefore converted to a very large
 * length (-1 becomes the "use strlen()" sentinel). Confirm whether that is
 * intended.
 */
MJS_PRIVATE void mjs_mkstr(struct mjs* mjs) {
    const int argc = mjs_nargs(mjs);
    mjs_val_t result = MJS_UNDEFINED;
    mjs_val_t ptr_v = MJS_UNDEFINED;
    mjs_val_t offset_v = MJS_UNDEFINED;
    mjs_val_t len_v = MJS_UNDEFINED;
    mjs_val_t copy_v = MJS_UNDEFINED;
    char* base = NULL;
    int byte_offset = 0;
    int byte_len = 0;
    int do_copy = 0;

    if(argc < 2 || argc > 4) {
        mjs_prepend_errorf(
            mjs,
            MJS_TYPE_ERROR,
            "mkstr takes 2, 3 or 4 arguments: (ptr, len), (ptr, "
            "offset, len) or (ptr, offset, len, copy)");
        goto done;
    }

    /* pick the arguments according to the form that was used */
    ptr_v = mjs_arg(mjs, 0);
    if(argc == 2) {
        len_v = mjs_arg(mjs, 1);
    } else {
        offset_v = mjs_arg(mjs, 1);
        len_v = mjs_arg(mjs, 2);
        if(argc == 4) {
            copy_v = mjs_arg(mjs, 3);
        }
    }

    /* type checks */
    if(!mjs_is_foreign(ptr_v)) {
        mjs_prepend_errorf(mjs, MJS_TYPE_ERROR, "ptr should be a foreign pointer");
        goto done;
    }
    if(offset_v != MJS_UNDEFINED && !mjs_is_number(offset_v)) {
        mjs_prepend_errorf(mjs, MJS_TYPE_ERROR, "offset should be a number");
        goto done;
    }
    if(!mjs_is_number(len_v)) {
        mjs_prepend_errorf(mjs, MJS_TYPE_ERROR, "len should be a number");
        goto done;
    }

    /* all arguments are fine: convert them and build the string */
    do_copy = mjs_is_truthy(mjs, copy_v);
    base = (char*)mjs_get_ptr(mjs, ptr_v);
    if(offset_v != MJS_UNDEFINED) {
        byte_offset = mjs_get_int(mjs, offset_v);
    }
    byte_len = mjs_get_int(mjs, len_v);

    result = mjs_mk_string(mjs, base + byte_offset, byte_len, do_copy);

done:
    mjs_return(mjs, result);
}
/*
 * Error codes used while decoding escape sequences. nextesc() returns them
 * negated. SLRE_INVALID_HEX_DIGIT is 0, so its negated form cannot be told
 * apart from a successfully decoded value of 0.
 */
enum unescape_error {
    SLRE_INVALID_HEX_DIGIT,
    SLRE_INVALID_ESC_CHAR,
    SLRE_UNTERM_ESC_SEQ,
};

/*
 * Returns the numeric value (0..15) of the hexadecimal digit `c`.
 * For any other character -SLRE_INVALID_HEX_DIGIT is returned, which is 0;
 * callers are expected to validate the character with isxdigit() first.
 */
static int hex(int c) {
    if(c >= '0' && c <= '9') return c - '0';
    if(c >= 'a' && c <= 'f') return c - 'a' + 10;
    if(c >= 'A' && c <= 'F') return c - 'A' + 10;
    return -SLRE_INVALID_HEX_DIGIT;
}

/*
 * Decodes one escape sequence. `*p` must point at the character that follows
 * the backslash; on return it points past the characters that were consumed.
 *
 * Returns the decoded character code, or a negated `enum unescape_error`:
 * -SLRE_UNTERM_ESC_SEQ when the input ends inside the sequence,
 * -SLRE_INVALID_HEX_DIGIT for malformed `\xHH` / `\uHHHH` and
 * -SLRE_INVALID_ESC_CHAR for an unknown escape character.
 *
 * The input is expected to be zero-terminated: up to four characters after
 * the escape character may be inspected.
 */
static int nextesc(const char** p) {
    const unsigned char* s = (const unsigned char*)(*p)++;

    switch(*s) {
    case 0:
        return -SLRE_UNTERM_ESC_SEQ;
    case 'c':
        /*
         * Control escape `\cX`: the value comes from the letter that follows
         * the 'c' (s[1]), not from the 'c' itself. Do not step over a
         * terminating zero byte.
         */
        if(s[1] == 0) {
            return -SLRE_UNTERM_ESC_SEQ;
        }
        ++*p;
        return s[1] & 31;
    case 'b':
        return '\b';
    case 't':
        return '\t';
    case 'n':
        return '\n';
    case 'v':
        return '\v';
    case 'f':
        return '\f';
    case 'r':
        return '\r';
    case '\\':
        return '\\';
    case 'u':
        /* exactly four hex digits */
        if(isxdigit(s[1]) && isxdigit(s[2]) && isxdigit(s[3]) && isxdigit(s[4])) {
            (*p) += 4;
            return hex(s[1]) << 12 | hex(s[2]) << 8 | hex(s[3]) << 4 | hex(s[4]);
        }
        return -SLRE_INVALID_HEX_DIGIT;
    case 'x':
        /* exactly two hex digits */
        if(isxdigit(s[1]) && isxdigit(s[2])) {
            (*p) += 2;
            return (hex(s[1]) << 4) | hex(s[2]);
        }
        return -SLRE_INVALID_HEX_DIGIT;
    default:
        return -SLRE_INVALID_ESC_CHAR;
    }
}
/*
 * Resolves backslash escapes in the `len` bytes at `s`.
 *
 * When `to` is NULL nothing is stored and only the size of the result is
 * computed; otherwise the result is written to `to`, which must be large
 * enough. Returns the number of bytes produced.
 *
 * A backslash followed by `"`, `'` or a newline yields that character.
 * Every other sequence is decoded by nextesc(); when nextesc() reports an
 * unknown escape character, the backslash and the character are both kept.
 * Any other result of nextesc(), including its remaining negative codes, is
 * stored as the character value. A backslash that is the last byte of the
 * input is kept as is.
 */
MJS_PRIVATE size_t unescape(const char* s, size_t len, char* to) {
    const char* const limit = s + len;
    size_t produced = 0;
    char scratch[4]; /* sink for the output while only counting */
    Rune ch;

    while(s < limit) {
        s += chartorune(&ch, s);

        if(ch == '\\' && s < limit) {
            const char next = *s;

            if(next == '"') {
                s++;
                ch = '"';
            } else if(next == '\'') {
                s++;
                ch = '\'';
            } else if(next == '\n') {
                s++;
                ch = '\n';
            } else {
                const char* esc_start = s;
                const int code = nextesc(&s);

                if(code == -SLRE_INVALID_ESC_CHAR) {
                    /* unknown escape: emit the backslash, then re-read the character */
                    ch = '\\';
                    s = esc_start;
                    produced += runetochar(to == NULL ? scratch : to + produced, &ch);
                    s += chartorune(&ch, s);
                } else {
                    ch = code;
                }
            }
        }

        produced += runetochar(to == NULL ? scratch : to + produced, &ch);
    }

    return produced;
}
/*
 * Inserts a length-prefixed string into the mbuf `m` at `offset`.
 *
 * The entry consists of the varint-encoded length, the string bytes and,
 * with EMBSTR_ZERO_TERM, a trailing zero byte. With EMBSTR_UNESCAPE the
 * `len` source bytes at `p` are run through unescape() and the length of
 * the result is what gets stored. `p` may point into `m` itself; it is
 * adjusted if the buffer moves while growing. A NULL `p` reserves the space
 * for `len` bytes without filling it in.
 *
 * If the buffer cannot be grown, `m` is left untouched; callers can detect
 * this by `m->len` not having changed.
 */
MJS_PRIVATE void embed_string(
    struct mbuf* m,
    size_t offset,
    const char* p,
    size_t len,
    uint8_t /*enum embstr_flags*/ flags) {
    char* old_base = m->buf;
    uint8_t p_backed_by_mbuf = p >= old_base && p < old_base + m->len;
    /* only unescape when there is a source to read from */
    int do_unescape = (p != NULL) && (flags & EMBSTR_UNESCAPE);
    size_t n = do_unescape ? unescape(p, len, NULL) : len;

    /* Calculate how many bytes length takes */
    size_t k = cs_varint_llen(n);

    /* total length: varint length + string len + zero-term */
    size_t tot_len = k + n + !!(flags & EMBSTR_ZERO_TERM);

    /* Allocate buffer; on failure nothing may be written */
    if(mbuf_insert(m, offset, NULL, tot_len) == 0) {
        return;
    }

    /* Fixup p if it was relocated by mbuf_insert() above */
    if(p_backed_by_mbuf) {
        p += m->buf - old_base;
    }

    /* Write length */
    cs_varint_encode(n, (unsigned char*)m->buf + offset, k);

    /* Write string */
    if(p != NULL) {
        if(do_unescape) {
            unescape(p, len, m->buf + offset + k);
        } else {
            memcpy(m->buf + offset + k, p, len);
        }
    }

    /* add NULL-terminator if needed */
    if(flags & EMBSTR_ZERO_TERM) {
        m->buf[offset + tot_len - 1] = '\0';
    }
}