unleashed-firmware/lib/mjs/mjs_tok.c

/*
 * Copyright (c) 2017 Cesanta Software Limited
 * All rights reserved
 */

#include <stdlib.h>
#include <string.h>

#include "common/cs_dbg.h"
#include "mjs_tok.h"

// Reset parser state `p` so that tokenizing starts at the beginning of `buf`.
//
// `file_name` and `buf` are stored as pointers (not copied). Line counters
// start at 1 and the offset-to-line-number map is initialized empty.
MJS_PRIVATE void pinit(const char* file_name, const char* buf, struct pstate* p) {
    memset(p, 0, sizeof(*p));

    p->file_name = file_name;
    p->buf = buf;
    p->pos = buf;

    p->line_no = 1;
    p->last_emitted_line_no = 1;

    mbuf_init(&p->offset_lineno_map, 0);
}

// Returns 1 if `c` is an ASCII whitespace character (space, tab, newline,
// vertical tab, form feed or carriage return), 0 otherwise.
//
// Hand-rolled instead of using the target libc's isspace(), which may
// mishandle negative arguments such as isspace(-1).
static int mjs_is_space(int c) {
    switch(c) {
    case ' ':
    case '\t':
    case '\n':
    case '\v':
    case '\f':
    case '\r':
        return 1;
    default:
        return 0;
    }
}

// Returns 1 if `c` is an ASCII decimal digit ('0'..'9'), 0 otherwise.
// Safe for any int value, including negative ones.
MJS_PRIVATE int mjs_is_digit(int c) {
    return !(c < '0' || c > '9');
}

// Returns 1 if `c` is an ASCII letter (a-z or A-Z), 0 otherwise.
// Safe for any int value, including negative ones.
static int mjs_is_alpha(int c) {
    const int is_lower = ('a' <= c && c <= 'z');
    const int is_upper = ('A' <= c && c <= 'Z');
    return is_lower || is_upper;
}

// Returns 1 if `c` may start an identifier: an underscore, a dollar sign or
// an ASCII letter. Digits are not accepted here; callers that scan the rest
// of an identifier check for them separately.
MJS_PRIVATE int mjs_is_ident(int c) {
    if(c == '_' || c == '$') return 1;
    return mjs_is_alpha(c) != 0;
}

// Try to parse an operator token that is one or two characters long.
//
// `first_chars` lists the characters accepted at the current position and
// `second_chars` those accepted right after it.
//
// Returns TOK_EOF if the current character is not in `first_chars` (or is the
// end of input). Otherwise returns the character itself for a one-character
// match, or `first << 8 | second` for a two-character match. On a
// two-character match p->pos and p->tok.len are advanced by one, so p->pos is
// always left on the last character of the token.
static int longtok(struct pstate* p, const char* first_chars, const char* second_chars) {
    const char first = p->pos[0];

    // strchr() treats the terminating NUL as part of the searched string, so
    // a NUL must be rejected explicitly; otherwise end of input would "match"
    // and p->pos[1] would be read past the end of the buffer.
    if(first == '\0' || strchr(first_chars, first) == NULL) return TOK_EOF;

    if(p->pos[1] != '\0' && strchr(second_chars, p->pos[1]) != NULL) {
        p->tok.len++;
        p->pos++;
        return p->pos[-1] << 8 | p->pos[0];
    }
    return first;
}

// Try to parse the exact three-character operator `a b c` at the current
// position.
//
// On a match, p->pos and p->tok.len are advanced by two (leaving p->pos on
// the last character of the token) and the characters are returned packed as
// `a << 16 | b << 8 | c`. Otherwise nothing is modified and TOK_EOF is
// returned.
static int longtok3(struct pstate* p, char a, char b, char c) {
    const char* s = p->pos;

    if(s[0] != a || s[1] != b || s[2] != c) return TOK_EOF;

    p->tok.len += 2;
    p->pos = s + 2;
    return s[0] << 16 | s[1] << 8 | s[2];
}

// Try to parse the exact four-character operator `a b c d` at the current
// position.
//
// On a match, p->pos and p->tok.len are advanced by three (leaving p->pos on
// the last character of the token) and the characters are returned packed as
// `a << 24 | b << 16 | c << 8 | d`. Otherwise nothing is modified and TOK_EOF
// is returned.
static int longtok4(struct pstate* p, char a, char b, char c, char d) {
    const char* s = p->pos;

    if(s[0] != a || s[1] != b || s[2] != c || s[3] != d) return TOK_EOF;

    p->tok.len += 3;
    p->pos = s + 3;
    return s[0] << 24 | s[1] << 16 | s[2] << 8 | s[3];
}

// Scan a numeric literal starting at p->pos.
//
// The literal's extent is whatever strtoul() (for a "0x" prefix) or strtod()
// (for everything else) consumes; the converted value itself is discarded
// here. On return p->tok.len holds the literal's length and p->pos is left on
// its last character. Always returns TOK_NUM.
static int getnum(struct pstate* p) {
    // Receive the end pointer in a properly typed local instead of writing
    // through a `char **` cast of &p->pos, which would modify a differently
    // qualified pointer object through an incompatible lvalue.
    char* end = NULL;

    if(p->pos[0] == '0' && p->pos[1] == 'x') {
        // MSVC6 strtod cannot parse 0x... numbers, thus this ugly workaround.
        (void)strtoul(p->pos + 2, &end, 16);
    } else {
        (void)strtod(p->pos, &end);
    }

    p->pos = end;
    p->tok.len = p->pos - p->tok.ptr;
    p->pos--; // step back onto the last character of the literal
    return TOK_NUM;
}

// Check whether the `len`-byte token starting at `s` is a reserved word.
//
// Returns the 1-based position of the word in the table below, or 0 if the
// token is not a reserved word. Callers add the result to a token id, so the
// ORDER OF THE TABLE MUST NOT CHANGE.
static int is_reserved_word_token(const char* s, int len) {
    // Kept in static read-only storage so the table is not rebuilt on the
    // stack on every call.
    static const char* const reserved[] = {
        "break",     "case",   "catch", "continue",   "debugger", "default",
        "delete",    "do",     "else",  "false",      "finally",  "for",
        "function",  "if",     "in",    "instanceof", "new",      "null",
        "return",    "switch", "this",  "throw",      "true",     "try",
        "typeof",    "var",    "void",  "while",      "with",     "let",
        "undefined", NULL};
    int i;

    // An empty token cannot match, and s[0] must not be read for it.
    // Every reserved word starts with a letter, so anything else is rejected
    // without scanning the table.
    if(len <= 0 || !mjs_is_alpha(s[0])) return 0;

    for(i = 0; reserved[i] != NULL; i++) {
        if(len == (int)strlen(reserved[i]) && strncmp(s, reserved[i], len) == 0) return i + 1;
    }
    return 0;
}

// Scan an identifier starting at p->pos.
//
// Consumes the longest run of identifier characters and digits. On return
// p->tok.len holds the identifier's length and p->pos is left on its last
// character. Always returns TOK_IDENT; reserved words are distinguished by
// the caller.
static int getident(struct pstate* p) {
    const char* end = p->pos;

    while(mjs_is_ident(end[0]) || mjs_is_digit(end[0])) {
        end++;
    }

    p->tok.len = end - p->tok.ptr;
    p->pos = end - 1;
    return TOK_IDENT;
}

// Scan a string literal whose opening quote (' or ") is at p->pos.
//
// p->tok.ptr is moved past the opening quote and p->tok.len is set to the
// length of the raw text between the quotes (escape sequences are not
// decoded here). p->pos is left on the closing quote, or on the terminating
// NUL if the literal is unterminated. Always returns TOK_STR.
static int getstr(struct pstate* p) {
    const int quote = p->pos[0];

    p->pos++;
    p->tok.ptr++;

    for(;;) {
        const char cur = p->pos[0];
        int step = 1;

        if(cur == '\0' || cur == quote) break;

        // A backslash followed by the quote character or one of b f n r t v \
        // is skipped as a unit, so an escaped quote does not end the literal.
        // Any other backslash is consumed like an ordinary character.
        if(cur == '\\') {
            const char next = p->pos[1];
            if(next != '\0' && (next == quote || strchr("bfnrtv\\", next) != NULL)) {
                step = 2;
            }
        }
        p->pos += step;
    }

    p->tok.len = p->pos - p->tok.ptr;
    return TOK_STR;
}

// Advance p->pos past any mix of whitespace, `//` line comments and
// `/* ... */` block comments, incrementing p->line_no for every newline
// that is skipped.
//
// Each pass skips one whitespace run, then at most one line comment and at
// most one block comment; passes repeat until one makes no progress. An
// unterminated block comment consumes the rest of the input.
static void skip_spaces_and_comments(struct pstate* p) {
    for(;;) {
        const char* const start = p->pos;

        // Whitespace run.
        for(; mjs_is_space(p->pos[0]); p->pos++) {
            if(p->pos[0] == '\n') p->line_no++;
        }

        // Line comment: stop on the newline (counted by the next whitespace
        // pass) or at end of input.
        if(p->pos[0] == '/' && p->pos[1] == '/') {
            while(p->pos[0] != '\n' && p->pos[0] != '\0') p->pos++;
        }

        // Block comment: scan for the closing "*/", counting newlines.
        if(p->pos[0] == '/' && p->pos[1] == '*') {
            for(p->pos += 2; p->pos[0] != '\0'; p->pos++) {
                if(p->pos[0] == '\n') p->line_no++;
                if(p->pos[0] == '*' && p->pos[1] == '/') {
                    p->pos += 2;
                    break;
                }
            }
        }

        // Nothing was skipped in this pass: a real token (or NUL) follows.
        if(p->pos == start) break;
    }
}

// Map a token ID produced by mjs_tok.c to the token ID produced by lemon.
//
// Operators arrive as their characters packed into an int, first character
// in the most significant used byte (see the DT/TT/QT helpers below). Any
// value that has no entry in the switch is returned unchanged.
static int ptranslate(int tok) {
#define DT(a, b) ((a) << 8 | (b))
#define TT(a, b, c) ((a) << 16 | (b) << 8 | (c))
#define QT(a, b, c, d) ((a) << 24 | (b) << 16 | (c) << 8 | (d))
    int out = tok; /* unmapped codes pass through untouched */
    /* clang-format off */
  switch (tok) {
    /* Punctuation and brackets */
    case ':': out = TOK_COLON;         break;
    case ';': out = TOK_SEMICOLON;     break;
    case ',': out = TOK_COMMA;         break;
    case '.': out = TOK_DOT;           break;
    case '?': out = TOK_QUESTION;      break;
    case '{': out = TOK_OPEN_CURLY;    break;
    case '}': out = TOK_CLOSE_CURLY;   break;
    case '(': out = TOK_OPEN_PAREN;    break;
    case ')': out = TOK_CLOSE_PAREN;   break;
    case '[': out = TOK_OPEN_BRACKET;  break;
    case ']': out = TOK_CLOSE_BRACKET; break;

    /* Single-character operators */
    case '=': out = TOK_ASSIGN; break;
    case '*': out = TOK_MUL;    break;
    case '+': out = TOK_PLUS;   break;
    case '-': out = TOK_MINUS;  break;
    case '/': out = TOK_DIV;    break;
    case '%': out = TOK_REM;    break;
    case '&': out = TOK_AND;    break;
    case '|': out = TOK_OR;     break;
    case '^': out = TOK_XOR;    break;
    case '!': out = TOK_NOT;    break;
    case '~': out = TOK_TILDA;  break;
    case '<': out = TOK_LT;     break;
    case '>': out = TOK_GT;     break;

    /* Two-character operators */
    case DT('<','<'): out = TOK_LSHIFT;       break;
    case DT('>','>'): out = TOK_RSHIFT;       break;
    case DT('-','-'): out = TOK_MINUS_MINUS;  break;
    case DT('+','+'): out = TOK_PLUS_PLUS;    break;
    case DT('+','='): out = TOK_PLUS_ASSIGN;  break;
    case DT('-','='): out = TOK_MINUS_ASSIGN; break;
    case DT('*','='): out = TOK_MUL_ASSIGN;   break;
    case DT('/','='): out = TOK_DIV_ASSIGN;   break;
    case DT('&','='): out = TOK_AND_ASSIGN;   break;
    case DT('|','='): out = TOK_OR_ASSIGN;    break;
    case DT('%','='): out = TOK_REM_ASSIGN;   break;
    case DT('^','='): out = TOK_XOR_ASSIGN;   break;
    case DT('=','='): out = TOK_EQ;           break;
    case DT('!','='): out = TOK_NE;           break;
    case DT('<','='): out = TOK_LE;           break;
    case DT('>','='): out = TOK_GE;           break;
    case DT('&','&'): out = TOK_LOGICAL_AND;  break;
    case DT('|','|'): out = TOK_LOGICAL_OR;   break;

    /* Three-character operators */
    case TT('=','=','='): out = TOK_EQ_EQ;         break;
    case TT('!','=','='): out = TOK_NE_NE;         break;
    case TT('<','<','='): out = TOK_LSHIFT_ASSIGN; break;
    case TT('>','>','='): out = TOK_RSHIFT_ASSIGN; break;
    case TT('>','>','>'): out = TOK_URSHIFT;       break;

    /* Four-character operator */
    case QT('>','>','>','='): out = TOK_URSHIFT_ASSIGN; break;

    default: break;
  }
    /* clang-format on */
    return out;
}

// Read the next token from the input and make it the current token.
//
// Skips leading whitespace and comments, then classifies the token by its
// first character. On return:
//   - p->tok.ptr / p->tok.len describe the token text (for strings, the text
//     between the quotes),
//   - p->prev_tok holds the previously current token id,
//   - p->tok.tok holds the new token id after ptranslate(), which is also
//     the return value,
//   - p->pos points just past the token (it is not advanced at end of input).
// If no rule matches, the token id stays TOK_INVALID and a single character
// is consumed.
MJS_PRIVATE int pnext(struct pstate* p) {
    int tmp, tok = TOK_INVALID;

    skip_spaces_and_comments(p);
    // Default to a one-character token starting here; the scanners below
    // adjust ptr/len for longer tokens.
    p->tok.ptr = p->pos;
    p->tok.len = 1;

    if(p->pos[0] == '\0') {
        tok = TOK_EOF;
    } else if(mjs_is_digit(p->pos[0])) {
        tok = getnum(p);
    } else if(p->pos[0] == '\'' || p->pos[0] == '"') {
        tok = getstr(p);
    } else if(mjs_is_ident(p->pos[0])) {
        tok = getident(p);
        /*
         * NOTE: getident() has side effects on `p`, and
         * `is_reserved_word_token()` relies on them. Since in C the order of
         * evaluation of operands is unspecified, `is_reserved_word_token()`
         * must be called in a separate statement.
         *
         * The 1-based reserved-word index (0 for a plain identifier) is added
         * to TOK_IDENT. NOTE(review): this presumably relies on the keyword
         * token ids following TOK_IDENT in the same order as the reserved
         * word table -- confirm against mjs_tok.h.
         */
        tok += is_reserved_word_token(p->tok.ptr, p->tok.len);
    } else if(strchr(",.:;{}[]()?", p->pos[0]) != NULL) {
        // Single-character punctuation that never starts a longer operator.
        tok = p->pos[0];
    } else if(
        (tmp = longtok3(p, '<', '<', '=')) != TOK_EOF ||
        (tmp = longtok3(p, '>', '>', '=')) != TOK_EOF ||
        (tmp = longtok4(p, '>', '>', '>', '=')) != TOK_EOF ||
        (tmp = longtok3(p, '>', '>', '>')) != TOK_EOF ||
        (tmp = longtok3(p, '=', '=', '=')) != TOK_EOF ||
        (tmp = longtok3(p, '!', '=', '=')) != TOK_EOF ||
        (tmp = longtok(p, "&", "&=")) != TOK_EOF || (tmp = longtok(p, "|", "|=")) != TOK_EOF ||
        (tmp = longtok(p, "<", "<=")) != TOK_EOF || (tmp = longtok(p, ">", ">=")) != TOK_EOF ||
        (tmp = longtok(p, "-", "-=")) != TOK_EOF || (tmp = longtok(p, "+", "+=")) != TOK_EOF) {
        // Multi-character operators. The order of the attempts above matters:
        // longer operators are tried before their shorter prefixes, and the
        // first helper that matches has already advanced `p`.
        tok = tmp;
    } else if((tmp = longtok(p, "^~+-%/*<>=!|&", "=")) != TOK_EOF) {
        // Remaining operators, optionally followed by '='.
        tok = tmp;
    }
    // Every scanner leaves p->pos on the last character of its token (or on
    // the closing quote of a string); step past it unless at end of input.
    if(p->pos[0] != '\0') p->pos++;
    LOG(LL_VERBOSE_DEBUG, ("  --> %d [%.*s]", tok, p->tok.len, p->tok.ptr));
    p->prev_tok = p->tok.tok;
    p->tok.tok = ptranslate(tok);
    return p->tok.tok;
}