1
1
mirror of https://github.com/kanaka/mal.git synced 2024-09-20 18:18:51 +03:00
mal/impls/c.2/reader.c
2021-05-12 09:32:29 -05:00

664 lines
16 KiB
C

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <gc.h>
#include "reader.h"
#define TOKEN_SPECIAL_CHARACTER 1
#define TOKEN_STRING 2
#define TOKEN_INTEGER 3
#define TOKEN_FLOAT 4
#define TOKEN_SYMBOL 5
#define TOKEN_COMMENT 6
#define TOKEN_KEYWORD 7
#define TOKEN_TRUE 8
#define TOKEN_FALSE 9
#define TOKEN_NIL 10
#define SYMBOL_NIL "nil"
#define SYMBOL_TRUE "true"
#define SYMBOL_FALSE "false"
#define SYMBOL_QUOTE "quote"
#define SYMBOL_QUASIQUOTE "quasiquote"
#define SYMBOL_UNQUOTE "unquote"
#define SYMBOL_SPLICE_UNQUOTE "splice-unquote"
#define SYMBOL_DEREF "deref"
#define SYMBOL_WITH_META "with-meta"
Reader* reader_make(long token_capacity) {
Reader* reader = GC_MALLOC(sizeof(*reader));
reader->max_tokens = token_capacity;
reader->position = 0;
reader->token_count = 0;
reader->token_data = GC_MALLOC(sizeof(Token*) * token_capacity);
reader->error = NULL;
return reader;
}
Reader* reader_append(Reader* reader, Token* token) {
if (reader->token_count < reader->max_tokens) {
reader->token_data[reader->token_count] = token;
reader->token_count++;
}
else {
/* TODO: expand the storage more intelligently */
reader->max_tokens *= 2;
reader = GC_REALLOC(reader, sizeof(*reader) * reader->max_tokens);
reader->token_data[reader->token_count] = token;
reader->token_count++;
}
return reader;
}
Token* reader_peek(const Reader* reader) {
return (reader->token_data[reader->position]);
}
Token* reader_next(Reader* reader) {
Token* tok = reader->token_data[reader->position];
if (reader->position == -1) {
return NULL;
}
else if (reader->position < reader->token_count) {
(reader->position)++;
return tok;
}
else {
reader->position = -1;
return tok;
}
}
void reader_print(Reader* reader) {
/* NOTE: needed for debugging the reader only */
Token* tok;
for (long i = 0; i < reader->token_count; i++) {
tok = reader_next(reader);
switch (tok->type) {
case TOKEN_SPECIAL_CHARACTER:
printf("special character: %s", tok->data);
break;
case TOKEN_STRING:
printf("string: %s", tok->data);
break;
case TOKEN_INTEGER:
printf("integer: %s", tok->data);
break;
case TOKEN_FLOAT:
printf("float: %s", tok->data);
break;
case TOKEN_SYMBOL:
printf("symbol: %s", tok->data);
break;
case TOKEN_COMMENT:
printf("comment: \"%s\"", tok->data);
break;
case TOKEN_KEYWORD:
printf("keyword: %s", tok->data);
break;
case TOKEN_TRUE:
printf("true: %s", tok->data);
break;
case TOKEN_FALSE:
printf("false: %s", tok->data);
break;
case TOKEN_NIL:
printf("nil: %s", tok->data);
break;
}
/* print an error for any tokens with an error string */
tok->error ? printf(" - %s", tok->error) : 0;
}
}
MalType* read_str(char* token_string) {
Reader* reader = tokenize(token_string);
if (reader->error) {
return make_error_fmt("Reader error: %s", reader->error);
}
else if (reader->token_count == 0) {
return make_nil();
}
else {
return read_form(reader);
}
}
Reader* tokenize(char* token_string) {
/* allocate enough space for a Reader */
/* TODO: over-allocates space */
Reader* reader = reader_make(strlen(token_string));
for (char* next = token_string; *next != '\0';) {
Token* token = NULL;
switch (*next) {
/* skip whitespace */
case ' ':
case ',':
case 0x0A: /* newline */
next++;
token = NULL; /* no token for whitespace */
break;
/* single character token */
case '[':
case '\\':
case ']':
case '{':
case '}':
case '(':
case ')':
case '\'':
case '@':
case '`':
case '^':
next = read_fixed_length_token(next, &token, 1);
break;
/* single or double character token */
case '~':
if ( *(next + 1) == '@' ) {
next = read_fixed_length_token(next, &token, 2);
}
else {
next = read_fixed_length_token(next, &token, 1);
}
break;
/* read string of characters within double quotes */
case '"':
next = read_string_token(next, &token);
break;
/* read a comment - all remaining input until newline */
case ';':
next = read_comment_token(next, &token);
token = NULL; /* skip token for comments */
break;
/* read an integer */
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
next = read_number_token(next, &token);
// next = read_integer_token(next, &token);
break;
/* integer may be prefixed with +/- */
case '+':
case '-':
if (isdigit(next[1])) {
next = read_number_token(next, &token);
// next = read_integer_token(next, &token);
}
else { /* if not digits it is part of a symbol */
next = read_symbol_token(next, &token);
}
break;
/* read keyword */
case ':':
next = read_keyword_token(next, &token);
break;
/* read anything else as a symbol */
default:
next = read_symbol_token(next, &token);
break;
}
if (!token) {
/* if no token was read (whitespace or comments)
continue the loop */
continue;
}
else {
if (token->error) {
/* report any errors with an early return */
reader = reader_append(reader, token);
reader->error = token->error;
return reader;
}
/* otherwise append the token and continue */
reader = reader_append(reader, token);
}
}
return reader;
}
char* read_fixed_length_token(char* current, Token** ptoken, int n) {
*ptoken = token_allocate(current, n, TOKEN_SPECIAL_CHARACTER, NULL);
return (current + n);
}
char* read_terminated_token (char* current, Token** ptoken, int token_type) {
static char* const terminating_characters = " ,[](){};\n";
/* search for first terminating character */
char* end = strpbrk(current, terminating_characters);
/* if terminating character is not found it implies the end of the string */
long token_length = !end ? strlen(current) : (end - current);
/* next token starts with the terminating character */
*ptoken = token_allocate(current, token_length, token_type, NULL);
return (current + token_length);
}
char* read_symbol_token (char* current, Token** ptoken) {
char* next = read_terminated_token(current, ptoken, TOKEN_SYMBOL);
/* check for reserved symbols */
if (strcmp((*ptoken)->data, SYMBOL_NIL) == 0) {
(*ptoken)->type = TOKEN_NIL;
}
else if (strcmp((*ptoken)->data, SYMBOL_TRUE) == 0) {
(*ptoken)->type = TOKEN_TRUE;
}
else if (strcmp((*ptoken)->data, SYMBOL_FALSE) == 0) {
(*ptoken)->type = TOKEN_FALSE;
}
/* TODO: check for invalid characters */
return next;
}
char* read_keyword_token (char* current, Token** ptoken) {
/* TODO: check for invalid characters */
return read_terminated_token(current + 1, ptoken, TOKEN_KEYWORD);
}
char* read_number_token(char* current, Token** ptoken) {
int has_decimal_point = 0;
char* next = read_terminated_token(current, ptoken, TOKEN_INTEGER);
long token_length = next - current;
/* first char is either digit or '+' or '-'
check the rest consists of valid characters */
for (long i = 1; i < token_length; i++) {
if ((*ptoken)->data[i] == '.' && has_decimal_point) {
(*ptoken)->error = "Invalid character reading number";
break;
}
else if ((*ptoken)->data[i] == '.' && !has_decimal_point) {
has_decimal_point = 1;
(*ptoken)->type = TOKEN_FLOAT;
break;
}
else if (!(isdigit((*ptoken)->data[i]))) {
(*ptoken)->error = "Invalid character reading number";
break;
}
}
return next;
}
char* read_string_token(char* current, Token** ptoken) {
char *start, *end, *error = NULL;
long token_length = 0;
start = current + 1;
while(1) {
end = strchr(start, '"'); /* find the next " character */
/* handle failure to find closing quotes - implies end of input has been reached */
if (!end) {
end = current + strlen(current);
token_length = strlen(current);
error = "EOF reached with unterminated string";
break;
}
/* if the character preceding the " is a '\' character (escape), need to check if it is escaping the " and if it
is then keep searching from the next character */
else if ( *(end - 1) == '\\') {
char* back_ptr = end - 1;
while (*back_ptr == '\\') {
back_ptr--; /* back up to count the escape characters '\' */
}
long escape_chars = (end - 1) - back_ptr;
if (escape_chars % 2 == 1) { /* odd number of '\' chars means " is not quoted */
start = end + 1; /* so keep searching */
} else {
/* even number of '\' characters means we found the terminating quote mark */
token_length = (end - current - 1); /* quotes are excluded from string token */
break;
}
}
else {
token_length = (end - current - 1); /* quotes are excluded from string token */
break;
}
}
char* unescaped_string = unescape_string(current + 1, token_length);
*ptoken = token_allocate(unescaped_string, strlen(unescaped_string), TOKEN_STRING, error);
return (end + 1);
}
char* read_comment_token(char* current, Token** ptoken) {
/* comment includes all remaining characters to the next newline */
/* search for newline character */
char* end = strchr(current, 0x0A);
/* if newline is not found it implies the end of string is reached */
long token_chars = !end ? strlen(current) : (end - current);
*ptoken = token_allocate(current, token_chars, TOKEN_COMMENT, NULL);
return (current + token_chars + 1); /* next token starts with the char after the newline */
}
MalType* read_form(Reader* reader) {
if (reader->token_count > 0) {
Token* tok = reader_peek(reader);
if (tok->type == TOKEN_SPECIAL_CHARACTER) {
switch(tok->data[0]) {
case '(':
return read_list(reader);
break;
case '[':
return read_vector(reader);
break;
case '{':
return read_hashmap(reader);
break;
case '\'':
/* create and return a MalType list (quote read_form) */
return make_symbol_list(reader, SYMBOL_QUOTE);
break;
case '`':
/* create and return a MalType list (quasiquote read_form) */
return make_symbol_list(reader, SYMBOL_QUASIQUOTE);
break;
case '~':
if (tok->data[1] == '@') {
/* create and return a MalType list (splice-unquote read_form) */
return make_symbol_list(reader, SYMBOL_SPLICE_UNQUOTE);
}
else {
/* create and return a MalType list (unquote read_form) */
return make_symbol_list(reader, SYMBOL_UNQUOTE);
}
case '@':
/* create and return a MalType list (deref read_form) */
return make_symbol_list(reader, SYMBOL_DEREF);
case '^':
/* create and return a MalType list (with-meta <second-form> <first-form>
where first form should ne a metadata map and second form is somethingh
that can have metadata attached */
reader_next(reader);
/* grab the components of the list */
MalType* symbol = make_symbol(SYMBOL_WITH_META);
MalType* first_form = read_form(reader);
MalType* second_form = read_form(reader);
/* push the symbol and the following forms onto a list */
list lst = NULL;
lst = list_push(lst, symbol);
lst = list_push(lst, second_form);
lst = list_push(lst, first_form);
lst = list_reverse(lst);
return make_list(lst);
default:
/* shouldn't happen */
return make_error_fmt("Reader error: Unknown special character '%c'", tok->data[0]);
}
} else { /* Not a special character */
return read_atom(reader);
}
}
else { /* no tokens */
return NULL;
}
}
MalType* read_list(Reader* reader) {
MalType* retval = read_matched_delimiters(reader, '(', ')' );
if (is_error(retval)) {
retval = make_error("Reader error: unbalanced parenthesis '()'");
}
else {
retval->type = MALTYPE_LIST;
}
return retval;
}
MalType* read_vector(Reader* reader) {
MalType* retval = read_matched_delimiters(reader, '[', ']' );
if (is_error(retval)) {
retval = make_error("Reader error: unbalanced brackets '[]'");
}
else {
retval->type = MALTYPE_VECTOR;
}
return retval;
}
MalType* read_hashmap(Reader* reader) {
MalType* retval = read_matched_delimiters(reader, '{', '}' );
if (is_error(retval)) {
retval = make_error("Reader error: unbalanced braces '{}'");
}
else if (list_count(retval->value.mal_list)%2 != 0) {
retval = make_error("Reader error: missing value in map literal");
}
else {
retval->type = MALTYPE_HASHMAP;
}
return retval;
}
MalType* read_matched_delimiters(Reader* reader, char start_delimiter, char end_delimiter) {
/* TODO: separate implementation of hashmap and vector */
Token* tok = reader_next(reader);
list lst = NULL;
if (reader_peek(reader)->data[0] == end_delimiter) {
reader_next(reader);
return make_list(NULL);
}
else {
while (tok->data[0] != end_delimiter) {
MalType* val = read_form(reader);
lst = list_push(lst, (gptr)val);
tok = reader_peek(reader);
if (!tok) {
/* unbalanced parentheses */
return make_error("");
}
}
reader_next(reader);
return make_list(list_reverse(lst));
}
}
MalType* read_atom(Reader* reader) {
Token* tok = reader_next(reader);
switch (tok->type) {
case TOKEN_SPECIAL_CHARACTER:
return make_symbol(tok->data);
break;
case TOKEN_COMMENT:
return make_error("Error: comment found in token strea");
break;
case TOKEN_STRING:
return make_string(tok->data);
break;
case TOKEN_INTEGER:
return make_integer(strtol(tok->data, NULL, 10));
break;
case TOKEN_FLOAT:
return make_float(atof(tok->data));
break;
case TOKEN_SYMBOL:
return make_symbol(tok->data);
break;
case TOKEN_KEYWORD:
return make_keyword(tok->data);
break;
case TOKEN_TRUE:
return make_true();
break;
case TOKEN_FALSE:
return make_false();
break;
case TOKEN_NIL:
return make_nil();
break;
}
return make_error("Reader error: Unknown atom type");
}
MalType* make_symbol_list(Reader* reader, char* symbol_name) {
reader_next(reader);
list lst = NULL;
/* push the symbol and the following form onto the list */
lst = list_push(lst, make_symbol(symbol_name));
lst = list_push(lst, read_form(reader));
return make_list(list_reverse(lst));
}
Token* token_allocate(char* str, long num_chars, int type, char* error) {
/* allocate space for the string */
char* data = GC_MALLOC(sizeof(*data) * num_chars + 1); /* include space for null byte */
strncpy (data, str, num_chars); /* copy num_chars characters into data */
data[num_chars] = '\0'; /* manually add the null byte */
/* allocate space for the token struct */
Token* token = GC_MALLOC(sizeof(*token));
token->data = data;
token->type = type;
token->error = error;
return token;
}
char* unescape_string(char* str, long length) {
char* dest = GC_MALLOC(sizeof(*dest)*length + 1);
long j = 0;
for (long i = 0; i < length; i++) {
/* look for the quoting character */
if (str[i] == '\\') {
switch (str[i+1]) {
/* replace '\"' with normal '"' */
case '"':
dest[j++]='"';
i++; /* skip extra char */
break;
/* replace '\n' with newline 0x0A */
case 'n':
dest[j++]= 0x0A;
i++; /* skip extra char */
break;
/* replace '\\' with '\' */
case '\\':
dest[j++]= '\\';
i++; /* skip extra char */
break;
default:
/* just a '\' symbol so copy it */
dest[j++]='\\';
}
}
/* not a quote so copy it */
else {
dest[j++] = str[i];
}
}
dest[j] = '\0';
return dest;
}