keithlisp/parse.c

#include "parse.h"

// Capacity of the token scratch buffer, including the slot for the terminating NUL.
#define lisp_tokenize_len 64
// Scratch buffer collecting the characters of the token currently being built.
char lisp_tokenize_buf[lisp_tokenize_len];
// Index of the next free slot in lisp_tokenize_buf (equals the pending token's length).
int lisp_tokenize_pos;
// True while the tokenizer is inside a double-quoted string literal.
bool lisp_tokenize_quote_mode;
// True when the previous character was a backslash that escapes the next character.
bool lisp_tokenize_esc_mode;
// Put the tokenizer back into its starting state: no pending characters,
// and neither string-literal mode nor escape mode active.
void lisp_tokenize_init() {
	lisp_tokenize_esc_mode = false;
	lisp_tokenize_quote_mode = false;
	lisp_tokenize_pos = 0;
	lisp_tokenize_buf[lisp_tokenize_pos] = '\0';
}
// Emit the characters pending in lisp_tokenize_buf as one token.
//
// The pending text is NUL-terminated, copied into a new lisp_string, and
// wrapped in a freshly allocated cons cell whose cdr is the empty list.
// That cell becomes *head when *head is NULL; otherwise it is handed to
// lisp_cons_join together with *head (presumably appended — confirm
// against lisp_cons_join). The scratch buffer is left empty afterwards.
void lisp_tokenize_next(lisp_cons** head) {
	// Snapshot the pending characters as a string.
	lisp_tokenize_buf[lisp_tokenize_pos] = '\0';
	lisp_string* text = lisp_string_create(lisp_tokenize_buf);

	// The next token starts from an empty buffer.
	lisp_tokenize_pos = 0;
	lisp_tokenize_buf[0] = '\0';

	// One-element list holding the token text.
	lisp_cons* cell = dbg_malloc(sizeof(lisp_cons));
	cell->cdr.value.cons = NULL;
	cell->cdr.type = LISP_T_CONS;
	cell->car.value.string = text;
	cell->car.type = LISP_T_STRING;

	if (*head == NULL) {
		*head = cell;
		return;
	}
	lisp_cons_join(*head, cell);
}
// Run `len` bytes of `buf` through the tokenizer, emitting each completed
// token through lisp_tokenize_next(head).
//
// All state lives in the file-level lisp_tokenize_* variables and is not
// reset here, so a token or string literal still open when `buf` ends is
// continued by the next call.
//
// Token rules:
//   - whitespace ends the pending token;
//   - ")" ends the pending token and is then emitted as its own token;
//   - "(" is appended to whatever is pending and then emitted, so a leading
//     quote stays attached ("'(" is a single token);
//   - '"' at the start of a token opens a string literal that runs to the
//     next unescaped '"'; delimiters and backslashes are kept in the token;
//   - outside a string literal a backslash is dropped and the character
//     after it is stored verbatim;
//   - a token that fills the scratch buffer is emitted early and the
//     remaining characters continue as a new token.
void lisp_tokenize(char* buf, int len, lisp_cons** head) {
	for (int i = 0; i < len; i++) {
		// Buffer full (the last slot is reserved for the NUL): flush first.
		if (lisp_tokenize_pos >= lisp_tokenize_len-1) {
			lisp_tokenize_pos = lisp_tokenize_len-1;
			lisp_tokenize_next(head);
		}
		char c = buf[i];
		if (lisp_tokenize_quote_mode) {
			lisp_tokenize_buf[lisp_tokenize_pos++] = c;
			if (lisp_tokenize_esc_mode) {
				// Escaped character: taken literally. This must also cover an
				// escaped backslash, otherwise "a\\" would never terminate
				// because its closing quote would look escaped.
				lisp_tokenize_esc_mode = false;
			} else if (c == '\\') {
				lisp_tokenize_esc_mode = true;
			} else if (c == '"') { // end of string
				lisp_tokenize_next(head);
				lisp_tokenize_quote_mode = false;
			}
			continue;
		}

		if (lisp_tokenize_esc_mode) {
			// Character following a backslash outside a string: store as-is.
			lisp_tokenize_buf[lisp_tokenize_pos++] = c;
			lisp_tokenize_esc_mode = false;
			continue;
		}

		if ((c == ' ') || (c == '\t') || (c == '\n') || (c == '\r') || (c == '\v') || (c == '\f')) {
			if (lisp_tokenize_pos > 0) // whitespace, end of token
				lisp_tokenize_next(head);
		} else if (c == ')') {
			// end current token
			if (lisp_tokenize_pos > 0)
				lisp_tokenize_next(head);
			// ) is its own token
			lisp_tokenize_buf[lisp_tokenize_pos++] = c;
			lisp_tokenize_next(head);
		} else if (c == '(') {
			// ( closes the token it is appended to; the pending text is
			// deliberately not flushed first so that "'(" stays together
			lisp_tokenize_buf[lisp_tokenize_pos++] = c;
			lisp_tokenize_next(head);
		} else if ((c == '"') && (lisp_tokenize_pos == 0)) {
			// enable quotes mode
			lisp_tokenize_buf[lisp_tokenize_pos++] = c;
			lisp_tokenize_quote_mode = true;
		} else if (c == '\\') {
			lisp_tokenize_esc_mode = true;
		} else {
			lisp_tokenize_buf[lisp_tokenize_pos++] = c;
		}
	}
}

// Try to read `token_lstr` as a numeric literal.
//
// Accepted forms (letters are case-insensitive):
//   - decimal integer:      123, -42
//   - hexadecimal integer:  0x1F, -0xff   (the "0x" prefix is required and
//                                          must be followed by a digit)
//   - decimal float:        1.5, -.25, 3.
//   - float with exponent:  1e5, 2.5e-3   (needs at least one mantissa digit
//                                          and one exponent digit)
// A '.' after a hex prefix is not rejected; the fraction is then read in
// decimal (pre-existing behavior, left unchanged).
//
// On success stores LISP_T_INT or LISP_T_FLOAT in `*value` and returns true.
// Returns false, leaving `*value` untouched, when the token is not a number,
// so that symbols such as "x1", "e5" or "-" fall through to atom handling.
//
// NOTE(review): the integer accumulation has no overflow check; a long
// enough run of digits overflows `long`.
bool lisp_parse_number(lisp_string* token_lstr, lisp_value* value) {
	int token_len = lisp_string_len(token_lstr);
	char* token_data = lisp_string_data(token_lstr);

	bool neg = false;
	bool exp_neg = false;
	bool exp_mode = false;
	bool point = false;
	bool valid = false; // true once the part being read has at least one digit
	long _int = 0;
	float _float = 0.0f;
	float weight = 0.1f; // place value of the next fractional digit
	int exp_i = 0;       // index where an exponent sign may appear
	int exp = 0;
	int base = 10;
	for (int i = 0; i < token_len; i++) {
		char c = token_data[i];
		// Fold letters (and only letters) to upper case.
		if (c >= 'a' && c <= 'z')
			c = c - ('a' - 'A');
		if (!exp_mode) {
			if (c == '-' && i == 0) {
				neg = true;
			} else if (c >= '0' && c <= '9') {
				valid = true;
				if (!point)
					_int = (_int * base) + (c - '0');
				else {
					_float += ((c - '0') * weight);
					weight /= 10.0f;
				}
			} else if (c >= 'A' && c <= 'F' && base == 16) {
				valid = true;
				_int = (_int * base) + (c - ('A' - 10));
			} else if (c == 'X' && valid && _int == 0 && i == (neg ? 2 : 1)) {
				// Hex prefix: only directly after a single leading "0"
				// (optionally signed), so identifiers like "x1" stay symbols.
				base = 16;
				valid = false; // at least one hex digit must follow
			} else if (c == '.' && !point) {
				point = true;
				_float = (float) _int;
			} else if (c == 'E' && valid) {
				// Exponent marker; requires mantissa digits so that a symbol
				// such as "e5" is not read as 0.0.
				valid = false; // at least one exponent digit must follow
				exp_mode = true;
				if (!point)
					_float = (float) _int;
				point = true;
				exp_i = i+1;
			} else {
				valid = false;
				break;
			}
		} else {
			if (c == '-' && i == exp_i) {
				exp_neg = true;
			} else if (c >= '0' && c <= '9') {
				valid = true;
				exp = (exp * 10) + (c - '0');
			} else {
				valid = false;
				break;
			}
		}
	}
	if (valid && !point) {
		if (neg)
			_int = -_int;
		value->type = LISP_T_INT;
		value->value._int = _int;
		return true;
	} else if (valid) {
		if (neg)
			_float = -_float;
		if (exp_neg)
			exp = -exp;
		value->type = LISP_T_FLOAT;
		value->value._float = _float;
		if (exp_mode)
			value->value._float *= powf(10, exp);
		return true;
	}
	return false;
}
// Value (0-15) of the hexadecimal digit `c`, or -1 when `c` is not one.
static int lisp_parse_hex_digit(char c) {
	if (c >= '0' && c <= '9')
		return c - '0';
	if (c >= 'A' && c <= 'F')
		return c - 'A' + 10;
	if (c >= 'a' && c <= 'f')
		return c - 'a' + 10;
	return -1;
}

// Try to read `token_lstr` as a string literal token.
//
// The token must start with '"' and be at least two characters long; its
// first and last characters are treated as the delimiters and dropped. The
// text in between is copied into a newly allocated lisp_string with escapes
// decoded:
//   \e \r \n \t \f \v   the corresponding control character
//   \xH or \xHH         the byte with that hexadecimal value
//   \<other>            <other> itself (covers \\ and \")
// A "\x" followed by no hex digit yields a plain 'x'.
//
// On success stores the new string in `*value` as LISP_T_STRING and returns
// true. Returns false, leaving `*value` untouched, when the token is not a
// string literal (including a lone '"', which has no room for a closing
// delimiter).
bool lisp_parse_string(lisp_string* token_lstr, lisp_value* value) {
	int token_len = lisp_string_len(token_lstr);
	char* token_data = lisp_string_data(token_lstr);

	// Require both delimiters; this also keeps the allocation size below
	// from going negative.
	if (token_len < 2 || token_data[0] != '"')
		return false;

	// Decoding never produces more bytes than it consumes, so the undecoded
	// length is a sufficient capacity.
	lisp_string* lstr = lisp_string_alloc(token_len - 2);
	int lstr_len = 0;
	char* lstr_data = lisp_string_data(lstr);

	bool esc_mode = false;
	int hex_left = 0;            // hex digits a pending \x may still consume
	unsigned char hex_value = 0; // value accumulated for the pending \x
	for (int i = 1; i < token_len-1; i++) {
		char c = token_data[i];
		if (hex_left > 0) {
			int digit = lisp_parse_hex_digit(c);
			if (digit >= 0) {
				hex_value = (unsigned char) ((hex_value << 4) | digit);
				if (--hex_left == 0)
					lstr_data[lstr_len++] = (char) hex_value;
				continue;
			}
			// Not a hex digit: the escape ends early. Emit what was read
			// ('x' itself if there were no digits) and then handle `c` as an
			// ordinary character below.
			lstr_data[lstr_len++] = (hex_left == 2) ? 'x' : (char) hex_value;
			hex_left = 0;
		}
		if (esc_mode) {
			esc_mode = false;
			switch (c) {
				case 'e':
					lstr_data[lstr_len++] = '\x1b'; // ESC; '\e' is not standard C
					break;
				case 'r':
					lstr_data[lstr_len++] = '\r';
					break;
				case 'n':
					lstr_data[lstr_len++] = '\n';
					break;
				case 't':
					lstr_data[lstr_len++] = '\t';
					break;
				case 'f':
					lstr_data[lstr_len++] = '\f';
					break;
				case 'v':
					lstr_data[lstr_len++] = '\v';
					break;
				case 'x':
					hex_left = 2;
					hex_value = 0;
					break;
				default:
					lstr_data[lstr_len++] = c;
					break;
			}
		} else if (c == '\\') {
			esc_mode = true;
		} else {
			lstr_data[lstr_len++] = c;
		}
	}
	// A \x escape cut short by the end of the literal.
	if (hex_left > 0)
		lstr_data[lstr_len++] = (hex_left == 2) ? 'x' : (char) hex_value;

	// Shrink the recorded length to the decoded size.
	// NOTE(review): this writes through an int*, which assumes a lisp_string
	// begins with its int length — confirm against the lisp_string definition.
	*((int*) lstr) = lstr_len;
	value->type = LISP_T_STRING;
	value->value.string = lstr;
	return true;
}
// Turn a single non-structural token into a value.
//
// Tokens beginning with '(', ')' or '\'' are structural and rejected with
// false. Otherwise the token is tried, in order, as the literal "nil" (the
// empty list), as a string literal, and as a number; anything else is
// interned through lisp_atomize and stored as an atom. Every non-structural
// token therefore yields true.
bool lisp_parse_scalar(lisp_string* token_lstr, lisp_value* value) {
	char* text = lisp_string_data(token_lstr);

	switch (text[0]) {
		case '(':
		case ')':
		case '\'':
			// parentheses, quote
			return false;
		default:
			break;
	}

	bool is_nil = lisp_string_len(token_lstr) == 3
		&& text[0] == 'n' && text[1] == 'i' && text[2] == 'l';
	if (is_nil) {
		value->type = LISP_T_CONS;
		value->value.cons = NULL;
		return true;
	}

	// String first, then number; the second is only attempted if the first fails.
	if (lisp_parse_string(token_lstr, value) || lisp_parse_number(token_lstr, value))
		return true;

	// Fallback: the token is a symbol.
	value->value.atom = lisp_atomize(token_lstr);
	value->type = LISP_T_ATOM;
	return true;
}

// Parse one complete expression from the front of the token list.
//
// tokens        list of token cells (car = lisp_string token text); tokens
//               are removed from the front with lisp_cons_pop.
// fetch_tokens  called whenever *tokens is NULL to obtain more tokens; a
//               false return means no more input is available.
// value         receives the parsed value on success.
//
// Handles three shapes:
//   - a token starting with '\'' : the quote character is stripped, the rest
//     is pushed back and parsed, and the result is wrapped as (quote X);
//   - "(" : elements are parsed until ")"; a lone "." switches to storing the
//     next element directly as the cdr of the last cell (dotted pair);
//   - anything else is delegated to lisp_parse_scalar.
//
// Returns true when a value was stored. Returns false when input ran out or
// when the next token is a ")"; in the latter case the token is put back at
// the front of *tokens.
//
// NOTE(review): when input runs out between list elements, the cells already
// linked into the partial list are not released here — no deep-free routine
// is visible in this file.
bool lisp_parse_recursive(lisp_cons** tokens, bool (*fetch_tokens)(lisp_cons**), lisp_value* value) {
	while (*tokens == NULL) // fetch more tokens
		if ((*fetch_tokens)(tokens) == false)
			return false;
	lisp_cons* cons = lisp_cons_pop(tokens);
	lisp_string* token_lstr = cons->car.value.string;
	char* buf = lisp_string_data(token_lstr);
	if (buf[0] == '\'') {
		// Replace the token by its text without the leading quote and put it
		// back so the recursive call below reads it as the quoted operand.
		lisp_string* new_lstr = lisp_string_create_raw(lisp_string_data(token_lstr)+1, lisp_string_len(token_lstr)-1);
		dbg_free(token_lstr);
		cons->car.value.string = new_lstr;
		*tokens = cons;
		// Build (quote <operand>) as a two-cell list.
		lisp_cons* quote_cons = dbg_malloc(sizeof(lisp_cons));
		quote_cons->car.type = LISP_T_ATOM;
		quote_cons->car.value.atom = atom_quote;
		quote_cons->cdr.type = LISP_T_CONS;
		lisp_cons* operand_cons = dbg_malloc(sizeof(lisp_cons));
		quote_cons->cdr.value.cons = operand_cons;
		operand_cons->cdr.type = LISP_T_CONS;
		operand_cons->cdr.value.cons = NULL;
		if (!lisp_parse_recursive(tokens, fetch_tokens, &operand_cons->car)) {
			// Nothing was stored in the operand; release both cells instead
			// of leaking them.
			dbg_free(operand_cons);
			dbg_free(quote_cons);
			return false;
		}
		value->type = LISP_T_CONS;
		value->value.cons = quote_cons;
		return true;
	}
	if (buf[0] == '(') {
		buf = NULL;
		dbg_free(token_lstr);
		lisp_cons* head = NULL;
		lisp_cons* tail = NULL;
		lisp_cons* new_cons = cons; // the "(" cell is reused for the first element
		bool cdr_mode = false;
		bool closed = false;        // true once the matching ")" was consumed
		while (lisp_parse_recursive(tokens, fetch_tokens, &new_cons->car)) {
			new_cons->cdr.type = LISP_T_CONS;
			new_cons->cdr.value.cons = NULL;
			if (head == NULL)
				head = new_cons;
			if (tail == NULL)
				tail = head;
			else {
				if (cdr_mode) {
					// Dotted pair: the element becomes the cdr itself.
					tail->cdr = new_cons->car;
					dbg_free(new_cons);
				} else {
					tail->cdr.value.cons = new_cons;
					tail = new_cons;
				}
			}
			new_cons = dbg_malloc(sizeof(lisp_cons));

			while (*tokens == NULL) // fetch more tokens
				if ((*fetch_tokens)(tokens) == false) {
					dbg_free(new_cons); // not linked anywhere yet
					return false;
				}
			// Peek at the next token for ")" or ".".
			token_lstr = (*tokens)->car.value.string;
			if (lisp_string_len(token_lstr) == 1) {
				if (lisp_string_data(token_lstr)[0] == ')') {
					cons = lisp_cons_pop(tokens);
					dbg_free(cons);
					dbg_free(token_lstr);
					closed = true;
					break;
				} else if (lisp_string_data(token_lstr)[0] == '.') {
					cons = lisp_cons_pop(tokens);
					dbg_free(cons);
					dbg_free(token_lstr);
					cdr_mode = true;
				}
			}
		}
		dbg_free(new_cons);
		// The loop can also end because the element parse hit a ")" and put
		// it back (empty list "()", or ")" right after "."). That ")" closes
		// this list, so consume it rather than leaving it in the stream.
		if (!closed && *tokens != NULL) {
			token_lstr = (*tokens)->car.value.string;
			if (lisp_string_len(token_lstr) == 1 && lisp_string_data(token_lstr)[0] == ')') {
				cons = lisp_cons_pop(tokens);
				dbg_free(cons);
				dbg_free(token_lstr);
			}
		}
		value->type = LISP_T_CONS;
		value->value.cons = head;

		return true;
	}
	if (lisp_parse_scalar(token_lstr, value)) {
		dbg_free(cons);
		dbg_free(token_lstr);
		return true;
	}
	// Structural token that cannot start an expression: put it back.
	*tokens = cons;
	return false;
}
oopsie i forgot to commit :zany: 2021-09-22 13:54:29 +00:00			`#include "parse.h"`

			`#define lisp_tokenize_len 64`
			`char lisp_tokenize_buf[lisp_tokenize_len];`
			`int lisp_tokenize_pos;`
			`bool lisp_tokenize_quote_mode;`
			`bool lisp_tokenize_esc_mode;`
			`void lisp_tokenize_init() {`
			`lisp_tokenize_buf[0] = '\0';`
			`lisp_tokenize_pos = 0;`
			`lisp_tokenize_quote_mode = false;`
			`lisp_tokenize_esc_mode = false;`
			`}`
			`void lisp_tokenize_next(lisp_cons** head) {`
			`lisp_tokenize_buf[lisp_tokenize_pos] = '\0';`
			`lisp_string* token_lstr = lisp_string_create(lisp_tokenize_buf);`
			`lisp_tokenize_buf[0] = '\0';`
			`lisp_tokenize_pos = 0;`

			`lisp_cons* tail = dbg_malloc(sizeof(lisp_cons));`
			`tail->car.type = LISP_T_STRING;`
			`tail->car.value.string = token_lstr;`
			`tail->cdr.type = LISP_T_CONS;`
			`tail->cdr.value.cons = NULL;`
			`if (*head != NULL)`
			`lisp_cons_join(*head, tail);`
			`else`
			`*head = tail;`
			`}`
			`void lisp_tokenize(char* buf, int len, lisp_cons** head) {`
			`for (int i = 0; i < len; i++) {`
			`if (lisp_tokenize_pos >= lisp_tokenize_len-1) {`
			`lisp_tokenize_pos = lisp_tokenize_len-1;`
			`lisp_tokenize_next(head);`
			`}`
			`char c = buf[i];`
			`if (lisp_tokenize_quote_mode) {`
			`lisp_tokenize_buf[lisp_tokenize_pos++] = c;`
			`if (c == '\\')`
			`lisp_tokenize_esc_mode = true;`
			`else if ((c == '"') & !lisp_tokenize_esc_mode) { // end of string`
			`lisp_tokenize_next(head);`
			`lisp_tokenize_quote_mode = false;`
			`} else`
			`lisp_tokenize_esc_mode = false;`
			`continue;`
			`}`

			`if (lisp_tokenize_esc_mode) {`
			`lisp_tokenize_buf[lisp_tokenize_pos++] = c;`
			`lisp_tokenize_esc_mode = false;`
			`continue;`
			`}`

			`if ((c == ' ') \|\| (c == '\t') \|\| (c == '\n') \|\| (c == '\r') \|\| (c == '\v') \|\| (c == '\f')) {`
			`if (lisp_tokenize_pos > 0) // whitespace, end of token`
			`lisp_tokenize_next(head);`
			`} else if (c == ')') {`
			`// end current token`
			`if (lisp_tokenize_pos > 0)`
			`lisp_tokenize_next(head);`
			`// ) is its own token`
			`lisp_tokenize_buf[lisp_tokenize_pos++] = c;`
			`lisp_tokenize_next(head);`
			`} else if (c == '(') {`
			`// ( is its own token`
			`lisp_tokenize_buf[lisp_tokenize_pos++] = c;`
			`lisp_tokenize_next(head);`
			`} else if ((c == '"') && (lisp_tokenize_pos == 0)) {`
			`// enable quotes mode`
			`lisp_tokenize_buf[lisp_tokenize_pos++] = c;`
			`lisp_tokenize_quote_mode = true;`
			`} else if (c == '\\') {`
			`lisp_tokenize_esc_mode = true;`
			`} else {`
			`lisp_tokenize_buf[lisp_tokenize_pos++] = c;`
			`}`
			`}`
			`}`

			`bool lisp_parse_number(lisp_string* token_lstr, lisp_value* value) {`
			`int token_len = lisp_string_len(token_lstr);`
			`char* token_data = lisp_string_data(token_lstr);`

			`bool neg = false;`
			`bool exp_neg = false;`
			`bool exp_mode = false;`
			`bool point = false;`
			`bool valid = false;`
			`long _int = 0;`
			`float _float = 0.0f;`
			`float weight = 0.1f;`
			`int exp_i = 0;`
			`int exp = 0;`
add let special form 2021-09-23 16:57:26 +00:00			`int base = 10;`
oopsie i forgot to commit :zany: 2021-09-22 13:54:29 +00:00			`for (int i = 0; i < token_len; i++) {`
			`char c = token_data[i];`
add let special form 2021-09-23 16:57:26 +00:00			`if (c >= 'a')`
			`c = c - ('a' - 'A');`
oopsie i forgot to commit :zany: 2021-09-22 13:54:29 +00:00			`if (!exp_mode) {`
			`if (c == '-' && i == 0) {`
			`neg = true;`
			`} else if (c >= '0' && c <= '9') {`
			`valid = true;`
			`if (!point)`
add let special form 2021-09-23 16:57:26 +00:00			`_int = (_int * base) + (c - '0');`
oopsie i forgot to commit :zany: 2021-09-22 13:54:29 +00:00			`else {`
			`_float += ((c - '0') * weight);`
			`weight /= 10.0f;`
			`}`
add let special form 2021-09-23 16:57:26 +00:00			`} else if (c >= 'A' && c <= 'F' && base == 16) {`
			`valid = true;`
			`_int = (_int * base) + (c - ('A' - 10));`
			`} else if (c == 'X') {`
			`base = 16;`
oopsie i forgot to commit :zany: 2021-09-22 13:54:29 +00:00			`} else if (c == '.' && !point) {`
			`point = true;`
			`_float = (float) _int;`
add let special form 2021-09-23 16:57:26 +00:00			`} else if (c == 'E') {`
oopsie i forgot to commit :zany: 2021-09-22 13:54:29 +00:00			`valid = false;`
			`exp_mode = true;`
			`if (!point)`
			`_float = (float) _int;`
			`point = true;`
			`exp_i = i+1;`
			`} else {`
			`valid = false;`
			`break;`
			`}`
			`} else {`
			`if (c == '-' && i == exp_i) {`
			`exp_neg = true;`
			`} else if (c >= '0' && c <= '9') {`
			`valid = true;`
			`exp = (exp * 10) + (c - '0');`
			`} else {`
			`valid = false;`
			`break;`
			`}`
			`}`
			`}`
			`if (valid && !point) {`
			`if (neg)`
			`_int = -_int;`
			`value->type = LISP_T_INT;`
			`value->value._int = _int;`
			`return true;`
			`} else if (valid) {`
			`if (neg)`
			`_float = -_float;`
			`if (exp_neg)`
			`exp = -exp;`
			`value->type = LISP_T_FLOAT;`
			`value->value._float = _float;`
			`if (exp_mode)`
			`value->value._float *= powf(10, exp);`
			`return true;`
			`}`
			`return false;`
			`}`
			`bool lisp_parse_string(lisp_string* token_lstr, lisp_value* value) {`
			`int token_len = lisp_string_len(token_lstr);`
			`char* token_data = lisp_string_data(token_lstr);`

			`if (token_data[0] != '"')`
			`return false;`

			`lisp_string* lstr = lisp_string_alloc(lisp_string_len(token_lstr) - 2);`
			`int lstr_len = 0;`
			`char* lstr_data = lisp_string_data(lstr);`

			`bool esc_mode = false;`
			`char esc_hex_mode = 0;`
			`char esc_hex = 0;`
			`for (int i = 1; i < token_len-1; i++) {`
			`char c = token_data[i];`
			`if (esc_mode) {`
			`esc_mode = false;`
			`switch (c) {`
			`case 'e':`
			`lstr_data[lstr_len++] = '\e';`
			`break;`
add let special form 2021-09-23 16:57:26 +00:00			`case 'r':`
			`lstr_data[lstr_len++] = '\r';`
oopsie i forgot to commit :zany: 2021-09-22 13:54:29 +00:00			`break;`
			`case 'n':`
			`lstr_data[lstr_len++] = '\n';`
			`break;`
			`case 't':`
			`lstr_data[lstr_len++] = '\t';`
			`break;`
add let special form 2021-09-23 16:57:26 +00:00			`case 'f':`
			`lstr_data[lstr_len++] = '\f';`
			`break;`
oopsie i forgot to commit :zany: 2021-09-22 13:54:29 +00:00			`case 'v':`
			`lstr_data[lstr_len++] = '\v';`
			`break;`
			`case 'x':`
			`esc_hex_mode = 2;`
			`esc_hex = 0;`
			`break;`
			`default:`
			`lstr_data[lstr_len++] = c;`
			`break;`
			`}`
			`} else if (esc_hex_mode > 0) {`
			`esc_hex_mode--;`
			`esc_hex <<= 4;`
			`if (c <= '9')`
			`esc_hex += (c - '0');`
			`else if (c <= 'F')`
			`esc_hex += (c - ('A'-10));`
			`else`
			`esc_hex += (c - ('a'-10));`

			`if (esc_hex_mode == 0)`
			`lstr_data[lstr_len++] = esc_hex;`
			`} else {`
			`if (c == '\\') {`
			`esc_mode = true;`
			`} else {`
			`lstr_data[lstr_len++] = c;`
			`}`
			`}`
			`}`
			`((int) lstr) = lstr_len;`
			`value->type = LISP_T_STRING;`
			`value->value.string = lstr;`
			`return true;`
			`}`
			`bool lisp_parse_scalar(lisp_string* token_lstr, lisp_value* value) {`
			`char* buf = lisp_string_data(token_lstr);`
			`if (buf[0] == '(' \|\| buf[0] == ')' \|\| buf[0] == '\'') // parentheses, quote`
			`return false;`
			`if (lisp_string_len(token_lstr) == 3 && buf[0] == 'n' && buf[1] == 'i' && buf[2] == 'l') { // nil`
			`value->type = LISP_T_CONS;`
			`value->value.cons = NULL;`
			`return true;`
			`}`

			`if (lisp_parse_string(token_lstr, value))`
			`return true;`
			`if (lisp_parse_number(token_lstr, value))`
			`return true;`
			`lisp_atom atom = lisp_atomize(token_lstr);`
			`value->type = LISP_T_ATOM;`
			`value->value.atom = atom;`
			`return true;`
			`}`

			`bool lisp_parse_recursive(lisp_cons** tokens, bool (fetch_tokens)(lisp_cons), lisp_value value) {`
			`while (*tokens == NULL) // fetch more tokens`
			`if ((*fetch_tokens)(tokens) == false)`
			`return false;`
			`lisp_cons* cons = lisp_cons_pop(tokens);`
			`lisp_string* token_lstr = cons->car.value.string;`
			`char* buf = lisp_string_data(token_lstr);`
			`if (buf[0] == '\'') {`
			`lisp_string* new_lstr = lisp_string_create_raw(lisp_string_data(token_lstr)+1, lisp_string_len(token_lstr)-1);`
			`dbg_free(token_lstr);`
			`cons->car.value.string = new_lstr;`
			`*tokens = cons;`
			`lisp_cons* quote_cons = dbg_malloc(sizeof(lisp_cons));`
			`quote_cons->car.type = LISP_T_ATOM;`
			`quote_cons->car.value.atom = atom_quote;`
			`quote_cons->cdr.type = LISP_T_CONS;`
			`quote_cons->cdr.value.cons = dbg_malloc(sizeof(lisp_cons));`
			`quote_cons->cdr.value.cons->cdr.type = LISP_T_CONS;`
			`quote_cons->cdr.value.cons->cdr.value.cons = NULL;`
			`if (!lisp_parse_recursive(tokens, fetch_tokens, &quote_cons->cdr.value.cons->car))`
			`return false;`
			`value->type = LISP_T_CONS;`
			`value->value.cons = quote_cons;`
			`return true;`
			`}`
			`if (buf[0] == '(') {`
			`buf = NULL;`
			`dbg_free(token_lstr);`
			`lisp_cons* head = NULL;`
			`lisp_cons* tail = NULL;`
			`lisp_cons* new_cons = cons;`
			`bool cdr_mode = false;`
			`while (lisp_parse_recursive(tokens, fetch_tokens, &new_cons->car)) {`
			`new_cons->cdr.type = LISP_T_CONS;`
			`new_cons->cdr.value.cons = NULL;`
			`if (head == NULL)`
			`head = new_cons;`
			`if (tail == NULL)`
			`tail = head;`
			`else {`
			`if (cdr_mode) {`
			`tail->cdr = new_cons->car;`
			`dbg_free(new_cons);`
			`} else {`
			`tail->cdr.value.cons = new_cons;`
			`tail = new_cons;`
			`}`
			`}`
			`new_cons = dbg_malloc(sizeof(lisp_cons));`

			`while (*tokens == NULL) // fetch more tokens`
			`if ((*fetch_tokens)(tokens) == false)`
			`return false;`
			`token_lstr = (*tokens)->car.value.string;`
			`if (lisp_string_len(token_lstr) == 1) {`
			`if (lisp_string_data(token_lstr)[0] == ')') {`
			`cons = lisp_cons_pop(tokens);`
			`dbg_free(cons);`
			`dbg_free(token_lstr);`
			`break;`
			`} else if (lisp_string_data(token_lstr)[0] == '.') {`
			`cons = lisp_cons_pop(tokens);`
			`dbg_free(cons);`
			`dbg_free(token_lstr);`
			`cdr_mode = true;`
			`}`
			`}`
			`}`
			`dbg_free(new_cons);`
			`value->type = LISP_T_CONS;`
			`value->value.cons = head;`

			`return true;`
			`}`
			`if (lisp_parse_scalar(token_lstr, value)) {`
			`dbg_free(cons);`
			`dbg_free(token_lstr);`
			`return true;`
			`}`
			`*tokens = cons;`
			`return false;`
			`}`