// keithlisp/parse.c

#include "parse.h"

// Size of the token accumulation buffer; one slot is kept for the
// terminating '\0', so a single token holds at most lisp_tokenize_len-1
// characters before the tokenizer flushes it.
#define lisp_tokenize_len 64
// Characters of the token currently being accumulated by lisp_tokenize().
char lisp_tokenize_buf[lisp_tokenize_len];
// Index in lisp_tokenize_buf where the next character will be stored;
// 0 means no token is pending.
int lisp_tokenize_pos;
// True while the tokenizer is inside a double-quoted string literal.
bool lisp_tokenize_quote_mode;
// True when a backslash has just been seen and the following character
// must not be interpreted by the tokenizer.
bool lisp_tokenize_esc_mode;
// Put the tokenizer back into its starting state: no pending escape, not
// inside a string literal, and an empty (NUL-terminated) token buffer.
void lisp_tokenize_init() {
	lisp_tokenize_esc_mode = false;
	lisp_tokenize_quote_mode = false;
	lisp_tokenize_pos = 0;
	lisp_tokenize_buf[lisp_tokenize_pos] = '\0';
}
// Finish the token currently held in lisp_tokenize_buf.
//
// The buffered characters are NUL-terminated and turned into a lisp_string
// with lisp_string_create(); that string becomes the car of a newly
// allocated cons cell whose cdr is the empty list. The cell becomes *head
// when the list is empty, otherwise it is handed to lisp_cons_join() together
// with *head. The buffer is rewound so the next token starts from scratch.
void lisp_tokenize_next(lisp_cons** head) {
	// terminate and snapshot the accumulated characters
	lisp_tokenize_buf[lisp_tokenize_pos] = '\0';
	lisp_string* text = lisp_string_create(lisp_tokenize_buf);

	// rewind the buffer for the next token
	lisp_tokenize_pos = 0;
	lisp_tokenize_buf[0] = '\0';

	// build the list node carrying the token
	lisp_cons* node = dbg_malloc(sizeof(lisp_cons));
	node->cdr.value.cons = NULL;
	node->cdr.type = LISP_T_CONS;
	node->car.value.string = text;
	node->car.type = LISP_T_STRING;

	if (*head == NULL) {
		*head = node;
		return;
	}
	lisp_cons_join(*head, node);
}
// Feed `len` characters from `buf` to the tokenizer. Every completed token
// is emitted through lisp_tokenize_next(), which adds it to the list at
// *head. Tokenizer state lives in the lisp_tokenize_* globals, so a token
// (or string literal) may span several calls.
//
// Token rules:
//   - whitespace ends the pending token;
//   - ')' ends the pending token and is emitted as its own token;
//   - '(' is appended to the pending token and emitted immediately (this is
//     what produces tokens such as "'(" for a quoted list);
//   - '"' at the start of a token opens a string literal, which runs up to
//     and including the next unescaped '"'; backslashes are kept in the
//     token text;
//   - outside a string literal, a backslash is dropped and the following
//     character is appended verbatim;
//   - a token that fills the buffer is flushed as-is and accumulation
//     continues in a new token.
void lisp_tokenize(char* buf, int len, lisp_cons** head) {
	for (int i = 0; i < len; i++) {
		if (lisp_tokenize_pos >= lisp_tokenize_len-1) {
			// buffer full: flush so the single write below stays in bounds
			lisp_tokenize_pos = lisp_tokenize_len-1;
			lisp_tokenize_next(head);
		}
		char c = buf[i];
		if (lisp_tokenize_quote_mode) {
			lisp_tokenize_buf[lisp_tokenize_pos++] = c;
			if (lisp_tokenize_esc_mode) {
				// this character is escaped (including an escaped
				// backslash), so it can neither close the string nor
				// start another escape
				lisp_tokenize_esc_mode = false;
			} else if (c == '\\') {
				lisp_tokenize_esc_mode = true;
			} else if (c == '"') { // end of string
				lisp_tokenize_next(head);
				lisp_tokenize_quote_mode = false;
			}
			continue;
		}

		if (lisp_tokenize_esc_mode) {
			// escaped character outside a string: take it literally
			lisp_tokenize_buf[lisp_tokenize_pos++] = c;
			lisp_tokenize_esc_mode = false;
			continue;
		}

		if ((c == ' ') || (c == '\t') || (c == '\n') || (c == '\r') || (c == '\v') || (c == '\f')) {
			if (lisp_tokenize_pos > 0) // whitespace, end of token
				lisp_tokenize_next(head);
		} else if (c == ')') {
			// end current token
			if (lisp_tokenize_pos > 0)
				lisp_tokenize_next(head);
			// ) is its own token
			lisp_tokenize_buf[lisp_tokenize_pos++] = c;
			lisp_tokenize_next(head);
		} else if (c == '(') {
			// ( ends the token it is appended to
			lisp_tokenize_buf[lisp_tokenize_pos++] = c;
			lisp_tokenize_next(head);
		} else if ((c == '"') && (lisp_tokenize_pos == 0)) {
			// enable quotes mode
			lisp_tokenize_buf[lisp_tokenize_pos++] = c;
			lisp_tokenize_quote_mode = true;
		} else if (c == '\\') {
			lisp_tokenize_esc_mode = true;
		} else {
			lisp_tokenize_buf[lisp_tokenize_pos++] = c;
		}
	}
}

// Try to interpret a token as a numeric literal.
//
// Accepted forms (letters are case-insensitive):
//   - decimal integer with optional leading '-':        42, -7
//   - hexadecimal integer with a "0x" prefix:           0x1F, -0xff
//   - decimal fraction:                                  1.5, .5, 3.
//   - either of the decimal forms followed by 'e' and an optionally
//     negative decimal exponent:                         1e3, 2.5e-2
//
// On success stores LISP_T_INT or LISP_T_FLOAT in *value and returns true.
// Returns false, leaving *value untouched, when the token is not a number.
// The 'x' and 'e' markers are only honoured after at least one digit so that
// symbols such as "x1" or "e1" are not mistaken for numbers.
//
// NOTE(review): the integer accumulator is not checked for overflow.
bool lisp_parse_number(lisp_string* token_lstr, lisp_value* value) {
	int token_len = lisp_string_len(token_lstr);
	char* token_data = lisp_string_data(token_lstr);

	bool neg = false;
	bool exp_neg = false;
	bool exp_mode = false;
	bool point = false;
	bool valid = false; // true once the part being read has at least one digit
	long _int = 0;
	float _float = 0.0f;
	float weight = 0.1f; // place value of the next fractional digit
	int exp_i = 0;       // index of the first character of the exponent
	int exp = 0;
	int base = 10;
	for (int i = 0; i < token_len; i++) {
		char c = token_data[i];
		// fold lowercase letters only; other bytes are left alone
		if (c >= 'a' && c <= 'z')
			c = c - ('a' - 'A');
		if (!exp_mode) {
			if (c == '-' && i == 0) {
				neg = true;
			} else if (c >= '0' && c <= '9') {
				valid = true;
				if (!point)
					_int = (_int * base) + (c - '0');
				else {
					_float += ((c - '0') * weight);
					weight /= 10.0f;
				}
			} else if (c >= 'A' && c <= 'F' && base == 16) {
				valid = true;
				_int = (_int * base) + (c - ('A' - 10));
			} else if (c == 'X' && valid && base == 10 && !point && _int == 0) {
				// "0x" prefix: switch to hex and require a hex digit to follow
				base = 16;
				valid = false;
			} else if (c == '.' && !point) {
				point = true;
				_float = (float) _int;
			} else if (c == 'E' && valid) {
				// start of exponent; it needs its own digits to be valid
				valid = false;
				exp_mode = true;
				if (!point)
					_float = (float) _int;
				point = true;
				exp_i = i+1;
			} else {
				valid = false;
				break;
			}
		} else {
			if (c == '-' && i == exp_i) {
				exp_neg = true;
			} else if (c >= '0' && c <= '9') {
				valid = true;
				exp = (exp * 10) + (c - '0');
			} else {
				valid = false;
				break;
			}
		}
	}
	if (valid && !point) {
		if (neg)
			_int = -_int;
		value->type = LISP_T_INT;
		value->value._int = _int;
		return true;
	} else if (valid) {
		if (neg)
			_float = -_float;
		if (exp_neg)
			exp = -exp;
		value->type = LISP_T_FLOAT;
		value->value._float = _float;
		if (exp_mode)
			value->value._float *= powf(10, exp);
		return true;
	}
	return false;
}
// Try to interpret a token as a double-quoted string literal.
//
// The token must start with '"' and be at least two characters long; the
// first and last characters are dropped and the characters in between are
// copied into a newly allocated lisp_string with escapes decoded:
//   \e \r \n \t \f \v   the corresponding control character
//   \xHH                the byte with hexadecimal value HH
//   \<other>            <other> itself (so \" and \\ work)
// Hex digits are not validated.
//
// On success stores the new string in *value as LISP_T_STRING and returns
// true. Returns false, leaving *value untouched, if the token is not a
// string literal.
bool lisp_parse_string(lisp_string* token_lstr, lisp_value* value) {
	int token_len = lisp_string_len(token_lstr);
	char* token_data = lisp_string_data(token_lstr);

	// A lone '"' (length 1) would otherwise request a negative allocation.
	if (token_len < 2 || token_data[0] != '"')
		return false;

	// Decoding never produces more characters than it consumes, so the body
	// length is a sufficient capacity.
	lisp_string* lstr = lisp_string_alloc(token_len - 2);
	int lstr_len = 0;
	char* lstr_data = lisp_string_data(lstr);

	bool esc_mode = false;
	char esc_hex_mode = 0; // number of hex digits still expected after \x
	char esc_hex = 0;      // value accumulated from those digits
	for (int i = 1; i < token_len-1; i++) {
		char c = token_data[i];
		if (esc_mode) {
			esc_mode = false;
			switch (c) {
				case 'e':
					// ESC; written as octal because '\e' is a compiler extension
					lstr_data[lstr_len++] = '\033';
					break;
				case 'r':
					lstr_data[lstr_len++] = '\r';
					break;
				case 'n':
					lstr_data[lstr_len++] = '\n';
					break;
				case 't':
					lstr_data[lstr_len++] = '\t';
					break;
				case 'f':
					lstr_data[lstr_len++] = '\f';
					break;
				case 'v':
					lstr_data[lstr_len++] = '\v';
					break;
				case 'x':
					esc_hex_mode = 2;
					esc_hex = 0;
					break;
				default:
					lstr_data[lstr_len++] = c;
					break;
			}
		} else if (esc_hex_mode > 0) {
			esc_hex_mode--;
			esc_hex <<= 4;
			if (c <= '9')
				esc_hex += (c - '0');
			else if (c <= 'F')
				esc_hex += (c - ('A'-10));
			else
				esc_hex += (c - ('a'-10));

			if (esc_hex_mode == 0)
				lstr_data[lstr_len++] = esc_hex;
		} else {
			if (c == '\\') {
				esc_mode = true;
			} else {
				lstr_data[lstr_len++] = c;
			}
		}
	}
	// Shrink the recorded length to the decoded size.
	// NOTE(review): presumably lisp_string keeps its length as an int at
	// offset 0 -- confirm against the lisp_string definition.
	*((int*) lstr) = lstr_len;
	value->type = LISP_T_STRING;
	value->value.string = lstr;
	return true;
}
// Convert a single non-structural token into a value.
//
// Tokens beginning with '(', ')' or '\'' are not scalars and yield false.
// Anything else always succeeds: "nil" becomes the empty list, otherwise the
// token is tried as a string literal, then as a number, and finally turned
// into an atom via lisp_atomize().
bool lisp_parse_scalar(lisp_string* token_lstr, lisp_value* value) {
	char* text = lisp_string_data(token_lstr);

	switch (text[0]) {
		case '(':
		case ')':
		case '\'':
			// parentheses, quote
			return false;
		default:
			break;
	}

	bool is_nil = lisp_string_len(token_lstr) == 3
		&& text[0] == 'n' && text[1] == 'i' && text[2] == 'l';
	if (is_nil) {
		value->value.cons = NULL;
		value->type = LISP_T_CONS;
		return true;
	}

	if (!lisp_parse_string(token_lstr, value) && !lisp_parse_number(token_lstr, value)) {
		// neither string nor number: it is a symbol
		lisp_atom symbol = lisp_atomize(token_lstr);
		value->value.atom = symbol;
		value->type = LISP_T_ATOM;
	}
	return true;
}

// Parse one expression from the token list at *tokens into *value.
//
// tokens        list of string tokens; consumed tokens are popped and their
//               cons cell and string are released with dbg_free().
// fetch_tokens  called whenever *tokens is empty to obtain more tokens;
//               returning false means no more input.
// value         receives the parsed expression on success.
//
// Handles three shapes:
//   - a token starting with '\'' : the quote character is stripped, the rest
//     of the token is put back at the front of *tokens, and the following
//     expression is wrapped as a two-element list headed by atom_quote;
//   - a token starting with '('  : elements are parsed until the matching
//     ')'; a lone "." token makes the next element the cdr of the last cell;
//   - anything else              : delegated to lisp_parse_scalar().
//
// Returns true on success. Returns false if input ran out, or if the next
// token cannot start an expression (a ')'), in which case that token is put
// back at the front of *tokens.
//
// NOTE(review): putting a popped cell back with `*tokens = cons` relies on
// lisp_cons_pop() leaving the cell's cdr pointing at the rest of the list.
// NOTE(review): if input runs out in the middle of a list, the elements
// parsed so far are not released.
bool lisp_parse_recursive(lisp_cons** tokens, bool (*fetch_tokens)(lisp_cons**), lisp_value* value) {
	while (*tokens == NULL) // fetch more tokens
		if ((*fetch_tokens)(tokens) == false)
			return false;
	lisp_cons* cons = lisp_cons_pop(tokens);
	lisp_string* token_lstr = cons->car.value.string;
	char* buf = lisp_string_data(token_lstr);
	if (buf[0] == '\'') {
		// strip the quote character and re-queue the remainder of the token
		lisp_string* new_lstr = lisp_string_create_raw(lisp_string_data(token_lstr)+1, lisp_string_len(token_lstr)-1);
		dbg_free(token_lstr);
		cons->car.value.string = new_lstr;
		*tokens = cons;
		// build (quote <expr>)
		lisp_cons* quote_cons = dbg_malloc(sizeof(lisp_cons));
		quote_cons->car.type = LISP_T_ATOM;
		quote_cons->car.value.atom = atom_quote;
		quote_cons->cdr.type = LISP_T_CONS;
		quote_cons->cdr.value.cons = dbg_malloc(sizeof(lisp_cons));
		quote_cons->cdr.value.cons->cdr.type = LISP_T_CONS;
		quote_cons->cdr.value.cons->cdr.value.cons = NULL;
		if (!lisp_parse_recursive(tokens, fetch_tokens, &quote_cons->cdr.value.cons->car)) {
			// nothing was stored in the cells on failure; release them
			dbg_free(quote_cons->cdr.value.cons);
			dbg_free(quote_cons);
			return false;
		}
		value->type = LISP_T_CONS;
		value->value.cons = quote_cons;
		return true;
	}
	if (buf[0] == '(') {
		buf = NULL;
		dbg_free(token_lstr);
		lisp_cons* head = NULL;
		lisp_cons* tail = NULL;
		lisp_cons* new_cons = cons; // reuse the '(' token's cell for the first element
		bool cdr_mode = false;
		bool closed = false; // true once this list's ')' has been consumed
		while (lisp_parse_recursive(tokens, fetch_tokens, &new_cons->car)) {
			new_cons->cdr.type = LISP_T_CONS;
			new_cons->cdr.value.cons = NULL;
			if (head == NULL)
				head = new_cons;
			if (tail == NULL)
				tail = head;
			else {
				if (cdr_mode) {
					// element after ".": becomes the cdr of the last cell
					tail->cdr = new_cons->car;
					dbg_free(new_cons);
				} else {
					tail->cdr.value.cons = new_cons;
					tail = new_cons;
				}
			}
			new_cons = dbg_malloc(sizeof(lisp_cons));

			while (*tokens == NULL) // fetch more tokens
				if ((*fetch_tokens)(tokens) == false) {
					dbg_free(new_cons);
					return false;
				}
			token_lstr = (*tokens)->car.value.string;
			if (lisp_string_len(token_lstr) == 1) {
				if (lisp_string_data(token_lstr)[0] == ')') {
					cons = lisp_cons_pop(tokens);
					dbg_free(cons);
					dbg_free(token_lstr);
					closed = true;
					break;
				} else if (lisp_string_data(token_lstr)[0] == '.') {
					cons = lisp_cons_pop(tokens);
					dbg_free(cons);
					dbg_free(token_lstr);
					cdr_mode = true;
				}
			}
		}
		// The loop can also end because the element parse hit a ')' and put
		// it back (empty list "()", or ')' right after "."). That ')' closes
		// this list, so consume it here instead of leaving it for the caller.
		if (!closed && *tokens != NULL) {
			token_lstr = (*tokens)->car.value.string;
			if (lisp_string_len(token_lstr) == 1 && lisp_string_data(token_lstr)[0] == ')') {
				lisp_cons* close_cons = lisp_cons_pop(tokens);
				dbg_free(close_cons);
				dbg_free(token_lstr);
			}
		}
		dbg_free(new_cons); // spare cell that never received an element
		value->type = LISP_T_CONS;
		value->value.cons = head;

		return true;
	}
	if (lisp_parse_scalar(token_lstr, value)) {
		dbg_free(cons);
		dbg_free(token_lstr);
		return true;
	}
	// not an expression start (a ')'): put the token back for the caller
	*tokens = cons;
	return false;
}