// flup/src/tokenizer.c

#include "tokenizer.h"
#include <ctype.h>
#include <math.h>
#include <stdio.h>
#include <inttypes.h>

// Consume a run of ASCII digits from the front of *source and return a view
// over exactly those digits (possibly empty).
static StringView StringViewOfNumberTillNextNonDigit(StringView* source)
{
    StringView stringViewOfNumber = StringView_Slice(*source, 0, 0);
    while (source->length != 0 && isdigit((unsigned char)source->source[0])) {
        *source = StringView_Drop(*source, 1);
        stringViewOfNumber.length++;
    }
    return stringViewOfNumber;
}

// Fold step: shift the accumulator one decimal digit left and add c.
static int64_t* _StringView_FoldInt64(char c, int64_t* i)
{
    *i = *i * 10 + (c - '0');
    return i;
}

// Same fold step for doubles; the decimal point is applied by the caller.
static double* _StringView_FoldDouble(char c, double* d)
{
    *d = *d * 10.0 + (c - '0');
    return d;
}

static Token _Tokenizer_ParseInt64(bool negative, StringView integerPart)
{
    int64_t theInt64 = 0;
    StringView_FoldLeft(integerPart, &theInt64, (StringViewFoldFunction) _StringView_FoldInt64);
    return (Token) {
        .type = TOKENTYPE_INTEGER,
        .get = {
            .integer = theInt64 * (negative ? -1 : 1)
        }
    };
}

static Token _Tokenizer_ParseDouble(bool negative, StringView integerPart, StringView decimalPart)
{
    double theDouble = 0.0;
    StringView_FoldLeft(integerPart, &theDouble, (StringViewFoldFunction) _StringView_FoldDouble);
    double theDecimal = 0.0;
    StringView_FoldLeft(decimalPart, &theDecimal, (StringViewFoldFunction) _StringView_FoldDouble);
    double result = (negative ? -1 : 1) * (theDouble + theDecimal / pow(10.0, decimalPart.length));
    return (Token) {
        .type = TOKENTYPE_DOUBLE,
        .get = {
            .decimal = result
        }
    };
}
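
/*
 * Worked example (illustrative only): for the input "12.5",
 * _Tokenizer_NumberToken arrives here with integerPart = "12" and
 * decimalPart = "5". The folds accumulate 12.0 and 5.0, and the result is
 * 12.0 + 5.0 / 10^1 = 12.5.
 */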

static Token _Tokenizer_NumberToken(StringView* source, TokenContext* current)
{
    uint32_t token_length = 0;
    bool negative = false;
    // Note: when called through Tokenizer_NextToken, a leading '-' has
    // already been consumed as TOKENTYPE_MINUS by _Tokenizer_SimpleToken,
    // so this branch only matters for direct callers.
    if (StringView_StartsWith(*source, StringView_FromString("-"))) {
        negative = true;
        *source = StringView_Drop(*source, 1);
        token_length++;
    }
    StringView integerPart = StringViewOfNumberTillNextNonDigit(source);
    token_length += integerPart.length;
    bool has_point = false;
    if (source->length != 0 && source->source[0] == '.') {
        *source = StringView_Drop(*source, 1);
        has_point = true;
        token_length++;
    }
    StringView decimalPart = StringViewOfNumberTillNextNonDigit(source);
    token_length += decimalPart.length;
    Token token;
    if (has_point) {
        token = _Tokenizer_ParseDouble(negative, integerPart, decimalPart);
    } else {
        token = _Tokenizer_ParseInt64(negative, integerPart);
    }
    // update context
    token.context = *current;
    current->col += token_length;
    return token;
}

static bool _Tokenizer_IdentifierLetter(char c)
{
    return isalnum((unsigned char)c);
}

static Token _Tokenizer_IdentifierToken(StringView* source, TokenContext* current)
{
    StringView identifier = StringView_TakeWhile(*source, _Tokenizer_IdentifierLetter);
    *source = StringView_Drop(*source, identifier.length);
    Token token = (Token) {
        .type = TOKENTYPE_IDENTIFIER,
        .get = {
            .identifier = identifier,
        },
        .context = *current
    };
    current->col += identifier.length;
    return token;
}

// A comment runs to the end of the line.
static bool _Tokenizer_ContinueCommentFunction(char c)
{
    return c != '\n';
}

// A string runs to the next '"'; escape sequences are not supported.
static bool _Tokenizer_ContinueStringFunction(char c)
{
    return c != '"';
}

static Token _Tokenizer_SimpleToken(StringView* source, TokenContext* current)
{
    // Matching is first-prefix-wins, so longer literals must precede their
    // prefixes ("->" before "-", "<=" before "<", ">=" before ">").
    const char* literal_table[] = { "{", "}", "&", ":", "+", "->", "-", "*", "/", "|", "==", "!=", "<=", "<", ">=", ">", ",", ";", "bind", "as", "(", ")" };
    const enum TokenType type_table[] = {
        TOKENTYPE_LEFT_BRACE,
        TOKENTYPE_RIGHT_BRACE,
        TOKENTYPE_AMPERSAND,
        TOKENTYPE_COLON,
        TOKENTYPE_PLUS,
        TOKENTYPE_ARROW,
        TOKENTYPE_MINUS,
        TOKENTYPE_MULTIPLY,
        TOKENTYPE_DIVIDE,
        TOKENTYPE_PIPE,
        TOKENTYPE_EQUALITY,
        TOKENTYPE_INEQUALITY,
        TOKENTYPE_LESSEQUAL,
        TOKENTYPE_LESSTHAN,
        TOKENTYPE_GREATEREQUAL,
        TOKENTYPE_GREATERTHAN,
        TOKENTYPE_COMMA,
        TOKENTYPE_SEMICOLON,
        TOKENTYPE_BIND,
        TOKENTYPE_AS,
        TOKENTYPE_LEFT_PAREN,
        TOKENTYPE_RIGHT_PAREN,
    };
    for (size_t i = 0; i < sizeof(literal_table) / sizeof(literal_table[0]); i++) {
        StringView literal_view = StringView_FromString(literal_table[i]);
        if (StringView_StartsWith(*source, literal_view)) {
            // Alphabetic keywords ("bind", "as") must end at a word boundary
            // so an identifier such as "binder" is not split into "bind" + "er".
            if (isalpha((unsigned char)literal_table[i][0])
                && source->length > literal_view.length
                && _Tokenizer_IdentifierLetter(source->source[literal_view.length])) {
                continue;
            }
            *source = StringView_Drop(*source, literal_view.length);
            Token token = (Token) {
                .type = type_table[i],
                .get = { .identifier = STRINGVIEW_NONE },
                .context = *current
            };
            current->col += literal_view.length;
            return token;
        }
    }
    return TOKEN_NONE;
}

Token _Tokenizer_CommentToken(StringView* source, TokenContext* current)
{
    // The comment view includes the leading '#'.
    StringView comment = StringView_SpanWhile(source, _Tokenizer_ContinueCommentFunction);
    Token token = (Token) {
        .type = TOKENTYPE_COMMENT,
        .get = { .identifier = comment },
        .context = *current
    };
    current->col += comment.length;
    return token;
}

Token _Tokenizer_StringToken(StringView* source, TokenContext* current)
{
    // Skip the opening quote, span to the closing quote, then widen the view
    // by one character on each side so the token includes both quotes.
    *source = StringView_Drop(*source, 1);
    StringView string = StringView_SpanWhile(source, _Tokenizer_ContinueStringFunction);
    string.source--;
    string.length += 2;
    *source = StringView_Drop(*source, 1);
    Token token = (Token) {
        .type = TOKENTYPE_STRING,
        .get = { .identifier = string },
        .context = *current
    };
    // Strings may span lines, so advance the row per newline and restart the
    // column after the last one.
    size_t newline_count = StringView_Count(string, StringView_FromString("\n"));
    if (newline_count == 0) {
        current->col += string.length;
    } else {
        current->row += newline_count;
        current->col = 1;
        StringView last_newline_split;
        StringView_LastSplit(&last_newline_split, &string, StringView_FromString("\n"));
        current->col += last_newline_split.length;
    }
    return token;
}
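
/*
 * Worked example (illustrative): a string literal whose body is `a`, a real
 * newline, then `bc` yields a token view of `"a` + newline + `bc"`. Here
 * newline_count is 1, so the row advances by one and the column becomes
 * 1 + 3 = 4 (the three characters `bc"` after the newline). This assumes
 * StringView_LastSplit writes the text after the final separator into its
 * first argument, which is what the column arithmetic above implies.
 */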

Token Tokenizer_NextToken(StringView* source, TokenContext* context)
{
    // Skip whitespace, tracking row/column in the context.
    while (source->length != 0 && isspace((unsigned char)source->source[0])) {
        if (source->source[0] == '\n') {
            context->col = 1;
            context->row++;
        } else {
            context->col++;
        }
        *source = StringView_Drop(*source, 1);
    }
    if (source->length == 0) {
        return TOKEN_NONE;
    }
    // Fixed literals and keywords take priority over the other token kinds.
    {
        Token simple_token = _Tokenizer_SimpleToken(source, context);
        if (simple_token.type != TOKENTYPE_NONE) {
            return simple_token;
        }
    }
    if (isdigit((unsigned char)source->source[0])) {
        // parse int/double; a leading '-' was already lexed as
        // TOKENTYPE_MINUS by the simple-token pass above
        return _Tokenizer_NumberToken(source, context);
    } else if (isalpha((unsigned char)source->source[0])) {
        // parse name
        return _Tokenizer_IdentifierToken(source, context);
    } else if (StringView_StartsWith(*source, StringView_FromString("#"))) {
        return _Tokenizer_CommentToken(source, context);
    } else if (StringView_StartsWith(*source, StringView_FromString("\""))) {
        return _Tokenizer_StringToken(source, context);
    } else {
        // The offending input is not consumed, so the caller must stop (or
        // skip) on TOKENTYPE_ERROR to avoid looping forever.
        Token non_token = (Token) { .type = TOKENTYPE_ERROR, .get = { .error = *source }, .context = *context };
        context->col++;
        return non_token;
    }
}

const char* TokenType_ToString(enum TokenType type)
{
    switch (type) {
    case TOKENTYPE_NONE:
        return "TOKENTYPE_NONE";
    case TOKENTYPE_INTEGER:
        return "TOKENTYPE_INTEGER";
    case TOKENTYPE_DOUBLE:
        return "TOKENTYPE_DOUBLE";
    case TOKENTYPE_IDENTIFIER:
        return "TOKENTYPE_IDENTIFIER";
    case TOKENTYPE_LEFT_BRACE:
        return "TOKENTYPE_LEFT_BRACE";
    case TOKENTYPE_RIGHT_BRACE:
        return "TOKENTYPE_RIGHT_BRACE";
    case TOKENTYPE_AMPERSAND:
        return "TOKENTYPE_AMPERSAND";
    case TOKENTYPE_PLUS:
        return "TOKENTYPE_PLUS";
    case TOKENTYPE_MINUS:
        return "TOKENTYPE_MINUS";
    case TOKENTYPE_MULTIPLY:
        return "TOKENTYPE_MULTIPLY";
    case TOKENTYPE_DIVIDE:
        return "TOKENTYPE_DIVIDE";
    case TOKENTYPE_PIPE:
        return "TOKENTYPE_PIPE";
    case TOKENTYPE_ARROW:
        return "TOKENTYPE_ARROW";
    case TOKENTYPE_COLON:
        return "TOKENTYPE_COLON";
    case TOKENTYPE_ERROR:
        return "TOKENTYPE_ERROR";
    case TOKENTYPE_EQUALITY:
        return "TOKENTYPE_EQUALITY";
    case TOKENTYPE_INEQUALITY:
        return "TOKENTYPE_INEQUALITY";
    case TOKENTYPE_LESSTHAN:
        return "TOKENTYPE_LESSTHAN";
    case TOKENTYPE_LESSEQUAL:
        return "TOKENTYPE_LESSEQUAL";
    case TOKENTYPE_GREATERTHAN:
        return "TOKENTYPE_GREATERTHAN";
    case TOKENTYPE_GREATEREQUAL:
        return "TOKENTYPE_GREATEREQUAL";
    case TOKENTYPE_COMMA:
        return "TOKENTYPE_COMMA";
    case TOKENTYPE_SEMICOLON:
        return "TOKENTYPE_SEMICOLON";
    case TOKENTYPE_BIND:
        return "TOKENTYPE_BIND";
    case TOKENTYPE_AS:
        return "TOKENTYPE_AS";
    case TOKENTYPE_LEFT_PAREN:
        return "TOKENTYPE_LEFT_PAREN";
    case TOKENTYPE_RIGHT_PAREN:
        return "TOKENTYPE_RIGHT_PAREN";
    case TOKENTYPE_COMMENT:
        return "TOKENTYPE_COMMENT";
    case TOKENTYPE_STRING:
        return "TOKENTYPE_STRING";
    }
    return "INVALID";
}

void _TokenContent_Print(Token* token)
{
    switch (token->type) {
    case TOKENTYPE_INTEGER:
        printf("%" PRIi64, token->get.integer);
        break;
    case TOKENTYPE_DOUBLE:
        printf("%f", token->get.decimal);
        break;
    case TOKENTYPE_IDENTIFIER:
    case TOKENTYPE_COMMENT:
    case TOKENTYPE_STRING:
    case TOKENTYPE_ERROR:
        // StringViews are not NUL-terminated, so bound the width explicitly.
        printf("%.*s", (int)token->get.identifier.length, token->get.identifier.source);
        break;
    case TOKENTYPE_NONE:
    case TOKENTYPE_LEFT_BRACE:
    case TOKENTYPE_RIGHT_BRACE:
    case TOKENTYPE_AMPERSAND:
    case TOKENTYPE_PLUS:
    case TOKENTYPE_MINUS:
    case TOKENTYPE_MULTIPLY:
    case TOKENTYPE_DIVIDE:
    case TOKENTYPE_PIPE:
    case TOKENTYPE_ARROW:
    case TOKENTYPE_COLON:
    case TOKENTYPE_SEMICOLON:
    case TOKENTYPE_EQUALITY:
    case TOKENTYPE_INEQUALITY:
    case TOKENTYPE_LESSTHAN:
    case TOKENTYPE_LESSEQUAL:
    case TOKENTYPE_GREATERTHAN:
    case TOKENTYPE_GREATEREQUAL:
    case TOKENTYPE_COMMA:
    case TOKENTYPE_BIND:
    case TOKENTYPE_AS:
    case TOKENTYPE_LEFT_PAREN:
    case TOKENTYPE_RIGHT_PAREN:
        break;
    }
}

void Token_Print(Token* token)
{
    printf("Token: type: %s, TokenContent: ", TokenType_ToString(token->type));
    _TokenContent_Print(token);
    printf(", context: row: %i, col: %i\n", token->context.row, token->context.col);
}
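
/*
 * Minimal usage sketch (hypothetical driver, not part of the build): walks
 * Tokenizer_NextToken over a small program until the source is exhausted.
 * The TokenContext initializer assumes 1-based row/col fields and nothing
 * else that needs setting; that is an assumption about tokenizer.h, not a
 * guarantee. Compile with -DTOKENIZER_DEMO to try it.
 */
#ifdef TOKENIZER_DEMO
int main(void)
{
    StringView source = StringView_FromString("bind x: 12.5 -> y; # done");
    TokenContext context = { .row = 1, .col = 1 }; // assumed 1-based origin
    for (;;) {
        Token token = Tokenizer_NextToken(&source, &context);
        if (token.type == TOKENTYPE_NONE) {
            break; // end of input
        }
        Token_Print(&token);
        if (token.type == TOKENTYPE_ERROR) {
            break; // the offending input is never consumed, so stop here
        }
    }
    return 0;
}
#endif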