1
1
mirror of https://github.com/kanaka/mal.git synced 2024-08-17 09:40:21 +03:00
mal/impls/zig/reader.zig
Joel Martin 8a19f60386 Move implementations into impls/ dir
- Reorder README to have implementation list after "learning tool"
  bullet.

- This also moves tests/ and libs/ into impls. It would be preferable
  to have these directories at the top level.  However, this causes
  difficulties with the wasm implementations which need pre-open
  directories and have trouble with paths starting with "../../". So
  in lieu of that, symlink those directories to the top-level.

- Move the run_argv_test.sh script into the tests directory for
  general hygiene.
2020-02-10 23:50:16 -06:00

342 lines
10 KiB
Zig

const fmt = @import("std").fmt;
const warn = @import("std").debug.warn;
pub const pcre = @cImport({
@cInclude("pcre.h");
});
const MalType = @import("types.zig").MalType;
const MalData = @import("types.zig").MalData;
const MalTypeValue = @import("types.zig").MalTypeValue;
const MalError = @import("error.zig").MalError;
const MalLinkedList = @import("linked_list.zig").MalLinkedList;
const printer = @import("printer.zig");
const Allocator = @import("std").heap.c_allocator;
const string_eql = @import("utils.zig").string_eql;
const linked_list = @import("linked_list.zig");
// PCRE pattern used to split input into tokens. Each match first skips any
// run of whitespace and commas, then captures exactly one token in group 1:
//   ~@                     the two-character splice-unquote marker
//   [\[\]{}()'`~^@]        a single special character
//   "(?:\\.|[^\\"])*"?     a double-quoted string with backslash escapes; the
//                          closing quote is optional, so an unterminated
//                          string still produces a token
//   ;.*                    a comment: a semicolon and the rest of the line
//   [^\s\[\]{}('"`,;)]*    a run of non-special characters (may be empty)
const match: [*]const u8 =
c\\[\s,]*(~@|[\[\]{}()'`~^@]|"(?:\\.|[^\\"])*"?|;.*|[^\s\[\]{}('"`,;)]*)
;
// Out-parameters handed to pcre_compile by read_str; nothing in this file
// reads them back.
var error_msg: [*c]const u8 = undefined;
var erroroffset: c_int = 0;
// Compiled form of `match`. Stays null until read_str compiles it on first
// use, after which the same compiled pattern is reused for every call.
var re: ?*pcre.pcre = null;
// Cursor over the token list produced by `tokenize`.
//
// `tokens` holds byte offsets into `string` as consecutive (start, end)
// pairs, so token number `i` is `string[tokens[2*i]..tokens[2*i+1]]`.
// `position` counts tokens, not offsets. Neither slice is copied or freed
// by the reader.
const Reader = struct {
    position: u32,
    string: []const u8,
    tokens: []usize,

    // Builds a reader positioned on the first token.
    pub fn init(string: []const u8, tokens: []usize) Reader {
        return Reader{
            .position = 0,
            .string = string,
            .tokens = tokens,
        };
    }

    // Returns the current non-comment token and steps past it.
    // Returns "" when no tokens are left (the position is still advanced).
    pub fn next(self: *Reader) []const u8 {
        const this_token = self.peek();
        self.position += 1;
        return this_token;
    }

    // Returns the current non-comment token without consuming it, or "" when
    // the tokens are exhausted. Comment tokens (starting with ';') are
    // skipped, which is why this takes a mutable receiver.
    pub fn peek(self: *Reader) []const u8 {
        while (!self.eol()) {
            const start = self.tokens[2 * self.position];
            const end = self.tokens[2 * self.position + 1];
            // The tokenizer can emit an empty token whose start equals
            // string.len (input ending in whitespace or commas). Only look
            // at the first byte when the token actually has one; otherwise
            // the index below would be out of bounds.
            if (start < end and self.string[start] == ';') {
                self.position += 1;
                continue;
            }
            return self.string[start..end];
        }
        return "";
    }

    // True once `position` is past the last (start, end) pair.
    pub fn eol(self: *Reader) bool {
        return (2 * self.position >= self.tokens.len);
    }
};
// One reader-macro rule: a token that read_form rewrites into a list headed
// by a symbol.
const AliasPair = struct {
// Exact token text that triggers the rewrite.
name: []const u8,
// Symbol name placed at the head of the resulting list.
value: []const u8,
// How many forms read_form reads after the token to use as operands.
count: u8,
};
// Reader macros recognised by read_form. A token equal to `name` is replaced
// by a list whose head is the symbol `value`, followed by `count` forms read
// from the input.
const alias_pairs = [_] AliasPair {
AliasPair {.name="@", .value="deref", .count=1},
AliasPair {.name="\'", .value="quote", .count=1},
AliasPair {.name="`", .value="quasiquote", .count=1},
AliasPair {.name="~", .value="unquote", .count=1},
AliasPair {.name="~@", .value="splice-unquote", .count=1},
// Takes two forms. read_form prepends each operand as it is read, so the
// input `^meta value` becomes (with-meta value meta).
AliasPair {.name="^", .value="with-meta", .count=2},
};
// Reads the next complete form from `reader`.
//
// Returns null when the reader has no tokens left, and a nil value when the
// current token is empty. Otherwise dispatches on the token: "(" / "[" / "{"
// start a list, vector or hashmap, a leading ":" makes a keyword, a token
// listed in `alias_pairs` is expanded into a list, and anything else is read
// as an atom.
pub fn read_form(reader: *Reader) MalError!?*MalType {
    if (reader.eol()) return null;

    const token = reader.peek();
    if (token.len == 0) {
        return MalType.new_nil(Allocator);
    }

    switch (token[0]) {
        '(' => return try read_list(reader),
        '[' => return try read_vector(reader),
        '{' => return try read_hashmap(reader),
        ':' => {
            const keyword = reader.next();
            // Drop the leading ':' from the keyword text.
            return MalType.new_keyword(Allocator, keyword[1..keyword.len]);
        },
        else => {},
    }

    for (alias_pairs) |pair| {
        if (string_eql(token, pair.name)) {
            var expansion = MalLinkedList.init(Allocator);
            const head_symbol = try MalType.new_generic(Allocator, pair.value);
            // Consume the reader-macro token itself.
            _ = reader.next();

            // Each operand is prepended, so operands end up in reverse
            // reading order behind the head symbol.
            var remaining: u8 = pair.count;
            while (remaining > 0) : (remaining -= 1) {
                const operand = (try read_form(reader)) orelse return MalError.ArgError;
                try linked_list.prepend_mal(Allocator, &expansion, operand);
            }
            try linked_list.prepend_mal(Allocator, &expansion, head_symbol);

            const result = try MalType.new_nil(Allocator);
            result.data = MalData{ .List = expansion };
            return result;
        }
    }

    return try read_atom(reader);
}
// Reads a list. The current token is consumed unconditionally as the opening
// delimiter, then forms are appended until a token starting with ")" is
// found. Returns ReaderUnmatchedParen if the tokens run out first.
pub fn read_list(reader: *Reader) MalError!*MalType {
    _ = reader.next(); // opening "("
    var elements = MalLinkedList.init(Allocator);
    const result: *MalType = try MalType.new_nil(Allocator);

    while (true) {
        if (reader.eol()) return MalError.ReaderUnmatchedParen;

        const upcoming = reader.peek();
        if (upcoming.len == 0) return MalError.ReaderUnmatchedParen;

        if (upcoming[0] != ')') {
            const element = (try read_form(reader)) orelse return MalError.ArgError;
            try linked_list.append_mal(Allocator, &elements, element);
            continue;
        }

        _ = reader.next(); // closing ")"
        result.data = MalData{ .List = elements };
        return result;
    }
}
// Reads a vector. The current token is consumed unconditionally as the
// opening delimiter, then forms are appended until a token starting with "]"
// is found. Returns ReaderUnmatchedParen if the tokens run out first.
pub fn read_vector(reader: *Reader) MalError!*MalType {
    _ = reader.next(); // opening "["
    var elements = MalLinkedList.init(Allocator);
    const result: *MalType = try MalType.new_nil(Allocator);

    while (true) {
        if (reader.eol()) return MalError.ReaderUnmatchedParen;

        const upcoming = reader.peek();
        if (upcoming.len == 0) return MalError.ReaderUnmatchedParen;

        if (upcoming[0] != ']') {
            const element = (try read_form(reader)) orelse return MalError.ArgError;
            try linked_list.append_mal(Allocator, &elements, element);
            continue;
        }

        _ = reader.next(); // closing "]"
        result.data = MalData{ .Vector = elements };
        return result;
    }
}
// Reads a hashmap literal. The current token is consumed unconditionally as
// the opening delimiter, then key/value pairs are read until a token
// starting with "}" is found.
//
// Errors:
//   ReaderUnmatchedParen - the tokens ran out before the closing "}".
//   TypeError            - a key is neither a string nor a keyword.
//   ReaderBadHashmap     - a key is followed by "}" or by the end of input
//                          instead of a value.
//   ArgError             - read_form produced no form where one was required.
pub fn read_hashmap(reader: *Reader) MalError!*MalType {
    _ = reader.next(); // opening "{"
    const new_hashmap = try MalType.new_hashmap(Allocator);
    while (!reader.eol()) {
        var next_token = reader.peek();
        if (next_token.len == 0) {
            return MalError.ReaderUnmatchedParen;
        }
        if (next_token[0] == '}') {
            _ = reader.next(); // closing "}"
            return new_hashmap;
        }
        const mal = (try read_form(reader)) orelse return MalError.ArgError;
        const key = switch (mal.data) {
            .String => |s| s,
            .Keyword => |kwd| kwd,
            else => return MalError.TypeError,
        };
        // Look at the token that follows the key. Without this re-peek the
        // check below would inspect the key's own token, could never fire,
        // and a dangling key such as {"a"} would swallow "}" as its value.
        next_token = reader.peek();
        if (next_token.len == 0 or next_token[0] == '}') {
            return MalError.ReaderBadHashmap;
        }
        const val = (try read_form(reader)) orelse return MalError.ArgError;
        try new_hashmap.hashmap_insert(key, val);
    }
    return MalError.ReaderUnmatchedParen;
}
// Returns true when `c` is an ASCII decimal digit.
fn char_is_int(c: u8) bool {
    return (c >= '0' and c <= '9');
}

// Decides whether `token` should be parsed as an integer: it must start with
// a digit, or with '-' immediately followed by a digit. Only the first one
// or two bytes are inspected, so "1abc" is still classified as an integer
// here. An empty token is never an integer.
fn token_is_int(token: []const u8) bool {
    // Guard the index below: an empty token has no first byte.
    if (token.len == 0) return false;
    if (char_is_int(token[0])) return true;
    return token.len >= 2 and token[0] == '-' and char_is_int(token[1]);
}
// Consumes one token from `reader` and turns it into a scalar value.
// Checked in order: integer-looking token, the literals "nil", "true" and
// "false", a token starting with a double quote (string), and finally
// anything else, which is handed to MalType.new_generic.
pub fn read_atom(reader: *Reader) MalError!*MalType {
    const token = reader.next();

    if (token_is_int(token)) {
        const int_atom = try MalType.new_nil(Allocator);
        try read_atom_int(int_atom, token);
        return int_atom;
    }

    if (string_eql(token, "nil")) return MalType.new_nil(Allocator);
    if (string_eql(token, "true")) return MalType.new_bool(Allocator, true);
    if (string_eql(token, "false")) return MalType.new_bool(Allocator, false);

    if (token[0] == '"') {
        const string_atom = try MalType.new_nil(Allocator);
        try read_atom_string(string_atom, token);
        return string_atom;
    }

    return try MalType.new_generic(Allocator, token);
}
// Parses `token` as a base-10 i32 and stores it in `mal_atom` as an Int.
// Any parse failure is reported as MalError.SystemError.
fn read_atom_int(mal_atom: *MalType, token: []const u8) MalError!void {
    // TODO: extract int type from union
    const parsed = fmt.parseInt(i32, token, 10) catch return MalError.SystemError;
    mal_atom.data = MalData{ .Int = parsed };
}
// Decodes a double-quoted string token and stores the result in `mal_atom`
// as a String.
//
// `token` must include both surrounding quotes. Inside the quotes a
// backslash escapes the next byte: "\n" becomes a newline and any other
// escaped byte is kept literally (so \" gives a quote and \\ a backslash).
//
// Returns ReaderUnmatchedString when the token is shorter than two bytes,
// is not delimited by quotes on both ends, or ends with a lone backslash
// before the closing quote. Returns SystemError on allocation failure.
fn read_atom_string(mal_atom: *MalType, token: []const u8) MalError!void {
    const n = token.len;
    // Check the length first: indexing token[0] / token[n-1] on an empty
    // token would be out of bounds (and n-1 would underflow).
    if (n <= 1 or token[0] != '"' or token[n - 1] != '"') {
        return MalError.ReaderUnmatchedString;
    }
    if (n <= 2) {
        // The token is just the two quotes: store a zero-length string.
        var string = Allocator.alloc(u8, 0) catch return MalError.SystemError;
        mal_atom.data = MalData{ .String = string };
        return;
    }

    // Decoding never grows the text, so n-2 bytes is always enough scratch.
    var tmp_buffer = Allocator.alloc(u8, n - 2) catch return MalError.SystemError;
    defer Allocator.free(tmp_buffer);
    var i: usize = 1; // read index into token, starting after the opening quote
    var j: usize = 0; // write index into tmp_buffer
    const escape_char: u8 = '\\'; //TODO: remove this comment required by bad emacs config '
    while (i < n - 1) {
        if (token[i] != escape_char) {
            tmp_buffer[j] = token[i];
            j += 1;
            i += 1;
        } else {
            // A backslash directly before the closing quote means the quote
            // itself was escaped and the string never terminates.
            if (i == n - 2) {
                return MalError.ReaderUnmatchedString;
            }
            if (token[i + 1] == 'n') {
                tmp_buffer[j] = '\n';
            } else {
                tmp_buffer[j] = token[i + 1];
            }
            j += 1;
            i += 2;
        }
    }

    // Copy the decoded bytes into an exactly-sized allocation for the value.
    var string = Allocator.alloc(u8, j) catch return MalError.SystemError;
    i = 0;
    while (i < j) {
        string[i] = tmp_buffer[i];
        i += 1;
    }
    mal_atom.data = MalData{ .String = string };
}
// Tokenizes `string` and returns a Reader positioned on its first token.
//
// The token pattern is compiled on the first call and cached in `re`.
// Returns SystemError if the pattern cannot be compiled, and forwards any
// error from `tokenize`. The returned Reader refers to `string` without
// copying it, and holds the offsets slice allocated by `tokenize`.
pub fn read_str(string: []const u8) MalError!Reader {
    if (re == null) {
        re = pcre.pcre_compile(&match[0], 0, &error_msg, &erroroffset, 0);
        // pcre_compile returns null on failure. Report it instead of handing
        // a null pattern to tokenize, which would silently produce no tokens.
        if (re == null) {
            return MalError.SystemError;
        }
    }
    const tokens = try tokenize(re, string);
    return Reader.init(string, tokens);
}
// Runs `regex` repeatedly over `string` and returns the byte offsets of
// capture group 1 for every match, flattened as (start, end) pairs.
// The returned slice is freshly allocated; the caller becomes its owner.
// Returns SystemError when an allocation fails.
pub fn tokenize(regex: ?*pcre.pcre, string: []const u8) MalError![]usize {
    // TODO: pass in allocator
    const capacity: usize = 3 * string.len + 10;

    // Output vector filled by pcre_exec: [0],[1] span the whole match and
    // [2],[3] span capture group 1.
    const ovector: []c_int = Allocator.alloc(c_int, capacity) catch return MalError.SystemError;
    defer Allocator.free(ovector);

    // Scratch space for the offset pairs until the final count is known.
    const scratch: []usize = Allocator.alloc(usize, capacity) catch return MalError.SystemError;
    defer Allocator.free(scratch);

    var used: usize = 0;
    var offset: c_int = 0;
    const subject_len: c_int = @intCast(c_int, string.len);

    while (offset < subject_len) {
        const rc: c_int = pcre.pcre_exec(regex, 0, &string[0], subject_len, offset, 0, &ovector[0], @intCast(c_int, capacity));
        if (rc <= 0) break;

        // Resume the next search where this whole match ended.
        offset = ovector[1];
        scratch[used] = @intCast(usize, ovector[2]);
        scratch[used + 1] = @intCast(usize, ovector[3]);
        used += 2;
    }

    // Hand back an exactly-sized copy; the scratch buffers are released by
    // the defers above.
    const matches: []usize = Allocator.alloc(usize, used) catch return MalError.SystemError;
    var k: usize = 0;
    while (k < used) : (k += 1) {
        matches[k] = scratch[k];
    }
    return matches;
}