#include "tokenizer.h"

#include <ctype.h>
#include <inttypes.h>
#include <math.h>
#include <stdio.h>

/* Consume a run of leading decimal digits from *source and return a view
 * covering exactly the digits consumed.  *source is advanced past them. */
static StringView StringViewOfNumberTillNextNonDigit(StringView* source) {
    StringView stringViewOfNumber = StringView_Slice(*source, 0, 0);
    while (source->length != 0 && isdigit((unsigned char)source->source[0])) {
        *source = StringView_Drop(*source, 1);
        stringViewOfNumber.length++;
    }
    return stringViewOfNumber;
}

/* Fold step: accumulate one decimal digit character into the running int64. */
static int64_t* _StringView_FoldInt64(char c, int64_t* i) {
    *i = *i * 10 + (c - '0');
    return i;
}

/* Fold step: accumulate one decimal digit character into the running double. */
static double* _StringView_FoldDouble(char c, double* d) {
    *d = *d * 10 + (c - '0');
    return d;
}

/* Build a TOKENTYPE_INTEGER token from an already-isolated digit run.
 * NOTE(review): calling through a cast to StringViewFoldFunction is only
 * well-defined if that typedef matches these helpers' signatures — confirm. */
static Token _Tokenizer_ParseInt64(bool negative, StringView integerPart) {
    int64_t theInt64 = 0;
    StringView_FoldLeft(integerPart, &theInt64, (StringViewFoldFunction) _StringView_FoldInt64);
    return (Token) {
        .type = TOKENTYPE_INTEGER,
        .get = { .integer = theInt64 * (negative ? -1 : 1) }
    };
}

/* Build a TOKENTYPE_DOUBLE token from the digit runs on either side of the
 * decimal point.  The fractional digits are folded as an integer and then
 * scaled down by 10^length to form the fraction. */
static Token _Tokenizer_ParseDouble(bool negative, StringView integerPart, StringView decimalPart) {
    double theDouble = 0.0;
    StringView_FoldLeft(integerPart, &theDouble, (StringViewFoldFunction) _StringView_FoldDouble);
    double theDecimal = 0.0;
    StringView_FoldLeft(decimalPart, &theDecimal, (StringViewFoldFunction) _StringView_FoldDouble);
    double result = (negative ? -1 : 1)
                  * (theDouble + theDecimal / pow(10.0, decimalPart.length));
    return (Token) {
        .type = TOKENTYPE_DOUBLE,
        .get = { .decimal = result }
    };
}

/* Lex an integer or floating-point literal at the head of *source, advancing
 * *source past it and advancing current->col by the literal's length.
 * An optional leading '-', an integer digit run, and an optional '.' followed
 * by a fractional digit run are accepted; the presence of '.' selects the
 * double form. */
static Token _Tokenizer_NumberToken(StringView* source, TokenContext* current) {
    uint32_t token_length = 0;
    bool negative = false;
    if (StringView_StartsWith(*source, StringView_FromString("-"))) {
        negative = true;
        *source = StringView_Drop(*source, 1);
        token_length++;
    }
    StringView integerPart = StringViewOfNumberTillNextNonDigit(source);
    token_length += integerPart.length;
    bool has_point = false;
    if (source->length != 0 && source->source[0] == '.') {
        *source = StringView_Drop(*source, 1);
        has_point = true;
        token_length++;
    }
    StringView decimalPart = StringViewOfNumberTillNextNonDigit(source);
    token_length += decimalPart.length;
    Token token;
    if (has_point) {
        token = _Tokenizer_ParseDouble(negative, integerPart, decimalPart);
    } else {
        token = _Tokenizer_ParseInt64(negative, integerPart);
    }
    // update context
    token.context = *current;
    current->col += token_length;
    return token;
}

/* Predicate: characters allowed inside an identifier. */
static bool _Tokenizer_IdentifierLetter(char c) {
    return isalnum((unsigned char)c);
}

/* Lex an identifier at the head of *source; the token borrows the identifier's
 * characters from the source buffer (no copy). */
static Token _Tokenizer_IdentifierToken(StringView* source, TokenContext* current) {
    StringView identifier = StringView_TakeWhile(*source, _Tokenizer_IdentifierLetter);
    *source = StringView_Drop(*source, identifier.length);
    Token token = (Token) {
        .type = TOKENTYPE_IDENTIFIER,
        .get = { .identifier = identifier, },
        .context = *current
    };
    current->col += identifier.length;
    return token;
}

/* Comments run to the end of the line. */
static bool _Tokenizer_ContinueCommentFunction(char c) {
    return c != '\n';
}

/* String bodies run to the next double quote. */
static bool _Tokenizer_ContinueStringFunction(char c) {
    return c != '"';
}

/* Try to lex a fixed literal (operator, punctuation, or keyword) at the head
 * of *source.  Returns TOKEN_NONE when nothing matches.
 *
 * The table is scanned in order, so any literal that is a prefix of another
 * ("-" of "->", "<" of "<=", ">" of ">=") must come AFTER the longer one,
 * otherwise the shorter literal would match first and split the token. */
static Token _Tokenizer_SimpleToken(StringView* source, TokenContext* current) {
    static const char* literal_table[] = {
        "{", "}", "&", ":",
        "+", "->", "-", "*", "/", "|",
        "==", "!=", "<=", "<", ">=", ">",
        ",", ";", "bind", "as", "(", ")"
    };
    static const enum TokenType type_table[] = {
        TOKENTYPE_LEFT_BRACE, TOKENTYPE_RIGHT_BRACE, TOKENTYPE_AMPERSAND, TOKENTYPE_COLON,
        TOKENTYPE_PLUS, TOKENTYPE_ARROW, TOKENTYPE_MINUS, TOKENTYPE_MULTIPLY,
        TOKENTYPE_DIVIDE, TOKENTYPE_PIPE,
        TOKENTYPE_EQUALITY, TOKENTYPE_INEQUALITY, TOKENTYPE_LESSEQUAL, TOKENTYPE_LESSTHAN,
        TOKENTYPE_GREATEREQUAL, TOKENTYPE_GREATERTHAN,
        TOKENTYPE_COMMA, TOKENTYPE_SEMICOLON, TOKENTYPE_BIND, TOKENTYPE_AS,
        TOKENTYPE_LEFT_PAREN, TOKENTYPE_RIGHT_PAREN,
    };
    for (size_t i = 0; i < sizeof(literal_table) / sizeof(literal_table[0]); i++) {
        StringView literal_view = StringView_FromString(literal_table[i]);
        if (!StringView_StartsWith(*source, literal_view)) {
            continue;
        }
        /* Alphabetic keywords ("bind", "as") must match a whole word, not a
         * prefix of a longer identifier such as "binder". */
        if (isalpha((unsigned char)literal_table[i][0])
            && source->length > literal_view.length
            && isalnum((unsigned char)source->source[literal_view.length])) {
            continue;
        }
        *source = StringView_Drop(*source, literal_view.length);
        Token token = (Token) {
            .type = type_table[i],
            .get = { .identifier = STRINGVIEW_NONE },
            .context = *current
        };
        current->col += literal_view.length;
        return token;
    }
    return TOKEN_NONE;
}

/* Lex a '#' comment running to end of line; the token borrows the comment
 * text from the source buffer. */
Token _Tokenizer_CommentToken(StringView* source, TokenContext* current) {
    StringView comment = StringView_SpanWhile(source, _Tokenizer_ContinueCommentFunction);
    Token token = (Token) {
        .type = TOKENTYPE_COMMENT,
        .get = { .identifier = comment },
        .context = *current
    };
    current->col += comment.length;
    return token;
}

/* Lex a double-quoted string.  The returned view is widened to include both
 * quote characters.  Strings may span lines; row/col are advanced accordingly.
 * NOTE(review): an unterminated string (no closing '"' before end of input)
 * still widens the view by 2 and drops one more character — confirm the
 * StringView helpers tolerate that, or reject unterminated strings upstream. */
Token _Tokenizer_StringToken(StringView* source, TokenContext* current) {
    *source = StringView_Drop(*source, 1);  /* opening quote */
    StringView string = StringView_SpanWhile(source, _Tokenizer_ContinueStringFunction);
    string.source--;                        /* re-include opening quote */
    string.length += 2;                     /* ...and the closing quote */
    *source = StringView_Drop(*source, 1);  /* consume closing quote */
    Token token = (Token) {
        .type = TOKENTYPE_STRING,
        .get = { .identifier = string },
        .context = *current
    };
    size_t newline_count = StringView_Count(string, StringView_FromString("\n"));
    if (newline_count == 0) {
        current->col += string.length;
    } else {
        /* Multi-line string: move to the first column of the last line, then
         * advance by the length of the text after the final newline. */
        current->row += newline_count;
        current->col = 1;
        StringView last_newline_split;
        StringView_LastSplit(&last_newline_split, &string, StringView_FromString("\n"));
        current->col += last_newline_split.length;
    }
    return token;
}

/* Produce the next token from *source, advancing *source and *context.
 * Returns TOKEN_NONE at end of input and a TOKENTYPE_ERROR token (carrying the
 * unconsumed remainder) on unrecognized input.
 * NOTE(review): the error path does not consume any characters, so a caller
 * that keeps calling after TOKENTYPE_ERROR will loop forever — confirm callers
 * stop on error. */
Token Tokenizer_NextToken(StringView* source, TokenContext* context) {
    /* Skip whitespace, tracking line/column position. */
    while (source->length != 0 && isspace((unsigned char)source->source[0])) {
        if (source->source[0] == '\n') {
            context->col = 1;
            context->row++;
        } else {
            context->col++;
        }
        *source = StringView_Slice(*source, 1, source->length);
    }
    if (source->length == 0) {
        return TOKEN_NONE;
    }
    {
        Token simple_token = _Tokenizer_SimpleToken(source, context);
        if (simple_token.type != TOKENTYPE_NONE) {
            return simple_token;
        }
    }
    /* NOTE(review): "-" is in the simple-token table, so the leading-'-'
     * branch here is only reachable if that table entry is removed —
     * negative literals currently lex as MINUS followed by a number. */
    if (isdigit((unsigned char)source->source[0])
        || StringView_StartsWith(*source, StringView_FromString("-"))) {
        // parse int/double
        return _Tokenizer_NumberToken(source, context);
    } else if (isalpha((unsigned char)source->source[0])) {
        // parse name
        return _Tokenizer_IdentifierToken(source, context);
    } else if (StringView_StartsWith(*source, StringView_FromString("#"))) {
        return _Tokenizer_CommentToken(source, context);
    } else if (StringView_StartsWith(*source, StringView_FromString("\""))) {
        return _Tokenizer_StringToken(source, context);
    } else {
        Token non_token = (Token) {
            .type = TOKENTYPE_ERROR,
            .get = { .error = *source },
            .context = *context
        };
        context->col++;
        return non_token;
    }
}

/* Map a token type to its enumerator name, for diagnostics. */
const char* TokenType_ToString(enum TokenType type) {
    switch (type) {
        case TOKENTYPE_NONE: return "TOKENTYPE_NONE";
        case TOKENTYPE_INTEGER: return "TOKENTYPE_INTEGER";
        case TOKENTYPE_DOUBLE: return "TOKENTYPE_DOUBLE";
        case TOKENTYPE_IDENTIFIER: return "TOKENTYPE_IDENTIFIER";
        case TOKENTYPE_LEFT_BRACE: return "TOKENTYPE_LEFT_BRACE";
        case TOKENTYPE_RIGHT_BRACE: return "TOKENTYPE_RIGHT_BRACE";
        case TOKENTYPE_AMPERSAND: return "TOKENTYPE_AMPERSAND";
        case TOKENTYPE_PLUS: return "TOKENTYPE_PLUS";
        case TOKENTYPE_MINUS: return "TOKENTYPE_MINUS";
        case TOKENTYPE_MULTIPLY: return "TOKENTYPE_MULTIPLY";
        case TOKENTYPE_DIVIDE: return "TOKENTYPE_DIVIDE";
        case TOKENTYPE_PIPE: return "TOKENTYPE_PIPE";
        case TOKENTYPE_ARROW: return "TOKENTYPE_ARROW";
        case TOKENTYPE_COLON: return "TOKENTYPE_COLON";
        case TOKENTYPE_ERROR: return "TOKENTYPE_ERROR";
        case TOKENTYPE_EQUALITY: return "TOKENTYPE_EQUALITY";
        case TOKENTYPE_INEQUALITY: return "TOKENTYPE_INEQUALITY";
        case TOKENTYPE_LESSTHAN: return "TOKENTYPE_LESSTHAN";
        case TOKENTYPE_LESSEQUAL: return "TOKENTYPE_LESSEQUAL";
        case TOKENTYPE_GREATERTHAN: return "TOKENTYPE_GREATERTHAN";
        case TOKENTYPE_GREATEREQUAL: return "TOKENTYPE_GREATEREQUAL";
        case TOKENTYPE_COMMA: return "TOKENTYPE_COMMA";
        case TOKENTYPE_SEMICOLON: return "TOKENTYPE_SEMICOLON";
        case TOKENTYPE_BIND: return "TOKENTYPE_BIND";
        case TOKENTYPE_AS: return "TOKENTYPE_AS";
        case TOKENTYPE_LEFT_PAREN: return "TOKENTYPE_LEFT_PAREN";
        case TOKENTYPE_RIGHT_PAREN: return "TOKENTYPE_RIGHT_PAREN";
        case TOKENTYPE_COMMENT: return "TOKENTYPE_COMMENT";
        case TOKENTYPE_STRING: return "TOKENTYPE_STRING";
    }
    return "INVALID";
}

/* Print a token's payload (if it has one) to stdout.  StringView payloads are
 * length-delimited, not NUL-terminated, hence the "%.*s" precision. */
void _TokenContent_Print(Token* token) {
    switch (token->type) {
        case TOKENTYPE_INTEGER:
            printf("%" PRIi64, token->get.integer);
            break;
        case TOKENTYPE_DOUBLE:
            printf("%f", token->get.decimal);
            break;
        case TOKENTYPE_IDENTIFIER:
        case TOKENTYPE_COMMENT:
        case TOKENTYPE_STRING:
        case TOKENTYPE_ERROR:
            printf("%.*s", (int)token->get.identifier.length, token->get.identifier.source);
            break;
        case TOKENTYPE_NONE:
        case TOKENTYPE_LEFT_BRACE:
        case TOKENTYPE_RIGHT_BRACE:
        case TOKENTYPE_AMPERSAND:
        case TOKENTYPE_PLUS:
        case TOKENTYPE_MINUS:
        case TOKENTYPE_MULTIPLY:
        case TOKENTYPE_DIVIDE:
        case TOKENTYPE_PIPE:
        case TOKENTYPE_ARROW:
        case TOKENTYPE_COLON:
        case TOKENTYPE_SEMICOLON:
        case TOKENTYPE_EQUALITY:
        case TOKENTYPE_INEQUALITY:
        case TOKENTYPE_LESSTHAN:
        case TOKENTYPE_LESSEQUAL:
        case TOKENTYPE_GREATERTHAN:
        case TOKENTYPE_GREATEREQUAL:
        case TOKENTYPE_COMMA:
        case TOKENTYPE_BIND:
        case TOKENTYPE_AS:
        case TOKENTYPE_LEFT_PAREN:
        case TOKENTYPE_RIGHT_PAREN:
            break;
    }
}

/* Print a full human-readable description of a token to stdout. */
void Token_Print(Token* token) {
    printf("Token: type: %s, TokenContent: ", TokenType_ToString(token->type));
    _TokenContent_Print(token);
    printf(" , context: row: %i, col: %i\n", token->context.row, token->context.col);
}