365 lines
9.3 KiB
C
365 lines
9.3 KiB
C
#include "tokenizer.h"
|
|
#include <ctype.h>
|
|
#include <math.h>
|
|
#include <stdio.h>
|
|
#include <inttypes.h>
|
|
|
|
/*
 * Consume the leading run of ASCII digits from *source.
 *
 * On return, *source has been advanced past the digits and the returned
 * view covers exactly the consumed run (length 0 if the first character
 * is not a digit).
 */
static StringView StringViewOfNumberTillNextNonDigit(StringView* source) {
    StringView stringViewOfNumber = StringView_Slice(*source, 0, 0);
    /* Cast to unsigned char: passing a (possibly negative) plain char to
     * isdigit() is undefined behavior. */
    while (source->length != 0 && isdigit((unsigned char)source->source[0])) {
        *source = StringView_Drop(*source, 1);
        stringViewOfNumber.length++;
    }
    return stringViewOfNumber;
}
|
|
|
|
/* FoldLeft step: shift the accumulator left one decimal place and append
 * digit character c. Returns the accumulator so folding can continue.
 * NOTE(review): overflow on very long digit runs is not checked. */
static int64_t* _StringView_FoldInt64(char c, int64_t* i)
{
    const int64_t digit = c - '0';
    *i = (*i * 10) + digit;
    return i;
}
|
|
|
|
/* FoldLeft step: same digit-append as _StringView_FoldInt64 but into a
 * double accumulator (used for both integer and fractional digit runs). */
static double* _StringView_FoldDouble(char c, double* d)
{
    const double digit = (double)(c - '0');
    *d = (*d * 10) + digit;
    return d;
}
|
|
|
|
/* Build a TOKENTYPE_INTEGER token from a view of decimal digits.
 * negative flips the sign of the parsed value. The token's context is
 * left zeroed; the caller stamps it. */
static Token _Tokenizer_ParseInt64(bool negative, StringView integerPart)
{
    int64_t value = 0;
    StringView_FoldLeft(integerPart, &value, (StringViewFoldFunction) _StringView_FoldInt64);

    if (negative) {
        value = -value;
    }

    return (Token) {
        .type = TOKENTYPE_INTEGER,
        .get = { .integer = value },
    };
}
|
|
|
|
/* Build a TOKENTYPE_DOUBLE token from the digit views on either side of
 * the decimal point. The fractional digits are folded as an integer and
 * then scaled down by 10^(number of decimal digits). */
static Token _Tokenizer_ParseDouble(bool negative, StringView integerPart, StringView decimalPart)
{
    double wholeValue = 0.0;
    StringView_FoldLeft(integerPart, &wholeValue, (StringViewFoldFunction) _StringView_FoldDouble);

    double fractionDigits = 0.0;
    StringView_FoldLeft(decimalPart, &fractionDigits, (StringViewFoldFunction) _StringView_FoldDouble);

    const double magnitude = wholeValue + fractionDigits / pow(10.0, decimalPart.length);
    const double result = negative ? -magnitude : magnitude;

    return (Token) {
        .type = TOKENTYPE_DOUBLE,
        .get = { .decimal = result },
    };
}
|
|
|
|
/* Lex a numeric literal of the form [-]digits[.digits] starting at the
 * head of *source, advancing *source past it. Produces an integer token,
 * or a double token when a decimal point is present. */
static Token _Tokenizer_NumberToken(StringView* source, TokenContext* current)
{
    uint32_t consumed = 0;

    bool negative = false;
    if (StringView_StartsWith(*source, StringView_FromString("-"))) {
        negative = true;
        *source = StringView_Drop(*source, 1);
        consumed++;
    }

    StringView integerPart = StringViewOfNumberTillNextNonDigit(source);
    consumed += integerPart.length;

    bool has_point = false;
    if (source->length != 0 && source->source[0] == '.') {
        has_point = true;
        *source = StringView_Drop(*source, 1);
        consumed++;
    }

    /* Empty when there was no point (or no digits after it). */
    StringView decimalPart = StringViewOfNumberTillNextNonDigit(source);
    consumed += decimalPart.length;

    Token token = has_point
        ? _Tokenizer_ParseDouble(negative, integerPart, decimalPart)
        : _Tokenizer_ParseInt64(negative, integerPart);

    /* Stamp the token with the position where it started, then advance. */
    token.context = *current;
    current->col += consumed;

    return token;
}
|
|
|
|
/* Predicate: c may appear in an identifier (letters and digits).
 * Cast to unsigned char: passing a negative plain char to isalnum()
 * is undefined behavior. */
static bool _Tokenizer_IdentifierLetter(char c)
{
    return isalnum((unsigned char)c) != 0;
}
|
|
|
|
/* Lex an identifier: the longest run of alphanumeric characters at the
 * head of *source. Advances *source past it and updates the column. */
static Token _Tokenizer_IdentifierToken(StringView* source, TokenContext* current)
{
    StringView name = StringView_TakeWhile(*source, _Tokenizer_IdentifierLetter);
    *source = StringView_Drop(*source, name.length);

    Token token = {
        .type = TOKENTYPE_IDENTIFIER,
        .get = { .identifier = name },
        .context = *current,
    };

    current->col += name.length;
    return token;
}
|
|
|
|
/* Predicate for SpanWhile: a comment runs until the end of the line. */
static bool _Tokenizer_ContinueCommentFunction(char c)
{
    return !(c == '\n');
}
|
|
|
|
/* Predicate for SpanWhile: a string body runs until the closing quote. */
static bool _Tokenizer_ContinueStringFunction(char c)
{
    return !(c == '"');
}
|
|
|
|
/* Try to lex a fixed literal (operator, punctuation, or keyword) at the
 * head of *source. Returns TOKEN_NONE when none matches.
 *
 * Matching is first-hit prefix matching, so wherever one literal is a
 * prefix of another the longer one MUST come first in the table:
 * previously "<" preceded "<=" and ">" preceded ">=", which made "<="
 * lex as LESSTHAN followed by an error token for "=". */
static Token _Tokenizer_SimpleToken(StringView* source, TokenContext* current)
{
    const char* literal_table[] = { "{", "}", "&", ":", "+", "->", "-", "*", "/", "|", "==", "!=", "<=", "<", ">=", ">", ",", ";", "bind", "as", "(", ")" };
    const enum TokenType type_table[] = {
        TOKENTYPE_LEFT_BRACE,
        TOKENTYPE_RIGHT_BRACE,
        TOKENTYPE_AMPERSAND,
        TOKENTYPE_COLON,
        TOKENTYPE_PLUS,
        TOKENTYPE_ARROW,
        TOKENTYPE_MINUS,
        TOKENTYPE_MULTIPLY,
        TOKENTYPE_DIVIDE,
        TOKENTYPE_PIPE,
        TOKENTYPE_EQUALITY,
        TOKENTYPE_INEQUALITY,
        TOKENTYPE_LESSEQUAL,
        TOKENTYPE_LESSTHAN,
        TOKENTYPE_GREATEREQUAL,
        TOKENTYPE_GREATERTHAN,
        TOKENTYPE_COMMA,
        TOKENTYPE_SEMICOLON,
        TOKENTYPE_BIND,
        TOKENTYPE_AS,
        TOKENTYPE_LEFT_PAREN,
        TOKENTYPE_RIGHT_PAREN,
    };

    for (size_t i = 0; i < sizeof(literal_table) / sizeof(literal_table[0]); i++) {
        StringView literal_view = StringView_FromString(literal_table[i]);
        if (!StringView_StartsWith(*source, literal_view)) {
            continue;
        }
        /* Keyword literals ("bind", "as") must not match the prefix of a
         * longer identifier such as "binder": require that the next
         * character is not an identifier character. */
        if (isalpha((unsigned char)literal_table[i][0])
            && source->length > literal_view.length
            && isalnum((unsigned char)source->source[literal_view.length])) {
            continue;
        }
        *source = StringView_Drop(*source, literal_view.length);
        Token token = (Token) {
            .type = type_table[i],
            .get = { .identifier = STRINGVIEW_NONE },
            .context = *current
        };
        current->col += literal_view.length;
        return token;
    }

    return TOKEN_NONE;
}
|
|
|
|
/* Lex a comment: everything from the current position up to (but not
 * including) the next newline. Advances *source and the column. */
Token _Tokenizer_CommentToken(StringView* source, TokenContext* current)
{
    StringView body = StringView_SpanWhile(source, _Tokenizer_ContinueCommentFunction);

    Token token = {
        .type = TOKENTYPE_COMMENT,
        .get = { .identifier = body },
        .context = *current,
    };

    current->col += body.length;
    return token;
}
|
|
|
|
/* Lex a double-quoted string literal at the head of *source.
 * The returned token's view INCLUDES both quote characters, and strings
 * may span multiple lines (row/col are updated accordingly).
 * NOTE(review): an unterminated string (no closing '"' before end of
 * input) still executes the final Drop — verify StringView_Drop is safe
 * on an empty view. */
Token _Tokenizer_StringToken(StringView* source, TokenContext* current)
{
    /* Skip the opening quote, then span the body up to the closing quote. */
    *source = StringView_Drop(*source, 1);
    StringView string = StringView_SpanWhile(source, _Tokenizer_ContinueStringFunction);
    /* Widen the body view by one char on each side so it covers the
     * surrounding quotes (source-- steps back onto the opening '"'). */
    string.source--;
    string.length += 2;
    /* Consume the closing quote from the input. */
    *source = StringView_Drop(*source, 1);

    Token token = (Token) {
        .type = TOKENTYPE_STRING,
        .get = { .identifier = string },
        .context = *current
    };

    /* Update the position: a single-line string just advances the column;
     * a multi-line string moves down by the newline count and the column
     * restarts after the last newline. */
    size_t newline_count = StringView_Count(string, StringView_FromString("\n"));
    if(newline_count == 0) {
        current->col += string.length;
    } else {
        current->row += newline_count;
        current->col = 1;
        StringView last_newline_split;
        StringView_LastSplit(&last_newline_split, &string, StringView_FromString("\n"));
        current->col += last_newline_split.length;
    }
    return token;
}
|
|
|
|
/* Produce the next token from *source, advancing *source past it and
 * keeping the row/col in *context up to date.
 * Returns TOKEN_NONE at end of input and a TOKENTYPE_ERROR token (whose
 * error view covers the remaining input) for an unrecognized character.
 *
 * Fixes: obfuscated `0[source]` indexing replaced with *source; <ctype.h>
 * calls now cast to unsigned char (plain char may be negative — UB); the
 * error path now consumes the offending character so a caller looping on
 * NextToken cannot spin forever on the same byte (col was already being
 * incremented, but the source was never advanced). */
Token Tokenizer_NextToken(StringView* source, TokenContext* context)
{
    /* Skip whitespace, tracking line/column position. */
    while (source->length != 0 && isspace((unsigned char)source->source[0])) {
        if (source->source[0] == '\n') {
            context->col = 1;
            context->row++;
        } else {
            context->col++;
        }
        *source = StringView_Drop(*source, 1);
    }

    if (source->length == 0) {
        return TOKEN_NONE;
    }

    /* Fixed literals (operators, punctuation, keywords) take priority. */
    {
        Token simple_token = _Tokenizer_SimpleToken(source, context);
        if (simple_token.type != TOKENTYPE_NONE) {
            return simple_token;
        }
    }

    if (isdigit((unsigned char)source->source[0]) || StringView_StartsWith(*source, StringView_FromString("-"))) {
        // parse int/double
        return _Tokenizer_NumberToken(source, context);
    } else if (isalpha((unsigned char)source->source[0])) {
        // parse name
        return _Tokenizer_IdentifierToken(source, context);
    } else if (StringView_StartsWith(*source, StringView_FromString("#"))) {
        return _Tokenizer_CommentToken(source, context);
    } else if (StringView_StartsWith(*source, StringView_FromString("\""))) {
        return _Tokenizer_StringToken(source, context);
    } else {
        /* Unknown character: report it, then skip it to guarantee progress. */
        Token non_token = (Token) { .type = TOKENTYPE_ERROR, .get = { .error = *source }, .context = *context };
        context->col++;
        *source = StringView_Drop(*source, 1);
        return non_token;
    }
}
|
|
|
|
const char* TokenType_ToString(enum TokenType type)
|
|
{
|
|
switch (type) {
|
|
case TOKENTYPE_NONE:
|
|
return "TOKENTYPE_NONE";
|
|
case TOKENTYPE_INTEGER:
|
|
return "TOKENTYPE_INTEGER";
|
|
case TOKENTYPE_DOUBLE:
|
|
return "TOKENTYPE_DOUBLE";
|
|
case TOKENTYPE_IDENTIFIER:
|
|
return "TOKENTYPE_IDENTIFIER";
|
|
case TOKENTYPE_LEFT_BRACE:
|
|
return "TOKENTYPE_LEFT_BRACE";
|
|
case TOKENTYPE_RIGHT_BRACE:
|
|
return "TOKENTYPE_RIGHT_BRACE";
|
|
case TOKENTYPE_AMPERSAND:
|
|
return "TOKENTYPE_AMPERSAND";
|
|
case TOKENTYPE_PLUS:
|
|
return "TOKENTYPE_PLUS";
|
|
case TOKENTYPE_MINUS:
|
|
return "TOKENTYPE_MINUS";
|
|
case TOKENTYPE_MULTIPLY:
|
|
return "TOKENTYPE_MULTIPLY";
|
|
case TOKENTYPE_DIVIDE:
|
|
return "TOKENTYPE_DIVIDE";
|
|
case TOKENTYPE_PIPE:
|
|
return "TOKENTYPE_PIPE";
|
|
case TOKENTYPE_ARROW:
|
|
return "TOKENTYPE_ARROW";
|
|
case TOKENTYPE_COLON:
|
|
return "TOKENTYPE_COLON";
|
|
case TOKENTYPE_ERROR:
|
|
return "TOKENTYPE_ERROR";
|
|
case TOKENTYPE_EQUALITY:
|
|
return "TOKENTYPE_EQUALITY";
|
|
case TOKENTYPE_INEQUALITY:
|
|
return "TOKENTYPE_INEQUALITY";
|
|
case TOKENTYPE_LESSTHAN:
|
|
return "TOKENTYPE_LESSTHAN";
|
|
case TOKENTYPE_LESSEQUAL:
|
|
return "TOKENTYPE_LESSEQUAL";
|
|
case TOKENTYPE_GREATERTHAN:
|
|
return "TOKENTYPE_GREATERTHAN";
|
|
case TOKENTYPE_GREATEREQUAL:
|
|
return "TOKENTYPE_GREATEREQUAL";
|
|
case TOKENTYPE_COMMA:
|
|
return "TOKENTYPE_COMMA";
|
|
case TOKENTYPE_SEMICOLON:
|
|
return "TOKENTYPE_SEMICOLON";
|
|
case TOKENTYPE_BIND:
|
|
return "BIND";
|
|
case TOKENTYPE_AS:
|
|
return "AS";
|
|
case TOKENTYPE_LEFT_PAREN:
|
|
return "LEFT_PAREN";
|
|
case TOKENTYPE_RIGHT_PAREN:
|
|
return "RIGHT_PAREN";
|
|
case TOKENTYPE_COMMENT:
|
|
return "COMMENT";
|
|
case TOKENTYPE_STRING:
|
|
return "STRING";
|
|
}
|
|
|
|
return "INVALID";
|
|
}
|
|
|
|
/* Print a token's payload (no trailing newline); tokens with no payload
 * (operators, punctuation, keywords) print nothing.
 * Fixed: the view-carrying cases used "%s" on StringView.source, but a
 * StringView is length-delimited, not NUL-terminated — "%s" would read
 * past the view's end. Use "%.*s" with the explicit length instead. */
void _TokenContent_Print(Token* token)
{
    switch (token->type) {
    case TOKENTYPE_INTEGER:
        printf("%" PRIi64, token->get.integer);
        break;
    case TOKENTYPE_DOUBLE:
        printf("%f", token->get.decimal);
        break;
    case TOKENTYPE_IDENTIFIER:
    case TOKENTYPE_COMMENT:
    case TOKENTYPE_STRING:
    case TOKENTYPE_ERROR:
        /* Bound the print to the view's length. */
        printf("%.*s", (int)token->get.identifier.length, token->get.identifier.source);
        break;
    case TOKENTYPE_NONE:
    case TOKENTYPE_LEFT_BRACE:
    case TOKENTYPE_RIGHT_BRACE:
    case TOKENTYPE_AMPERSAND:
    case TOKENTYPE_PLUS:
    case TOKENTYPE_MINUS:
    case TOKENTYPE_MULTIPLY:
    case TOKENTYPE_DIVIDE:
    case TOKENTYPE_PIPE:
    case TOKENTYPE_ARROW:
    case TOKENTYPE_COLON:
    case TOKENTYPE_SEMICOLON:
    case TOKENTYPE_EQUALITY:
    case TOKENTYPE_INEQUALITY:
    case TOKENTYPE_LESSTHAN:
    case TOKENTYPE_LESSEQUAL:
    case TOKENTYPE_GREATERTHAN:
    case TOKENTYPE_GREATEREQUAL:
    case TOKENTYPE_COMMA:
    case TOKENTYPE_BIND:
    case TOKENTYPE_AS:
    case TOKENTYPE_LEFT_PAREN:
    case TOKENTYPE_RIGHT_PAREN:
        break;
    }
}
|
|
|
|
/* Print a one-line human-readable description of a token to stdout.
 * Fixed: "contex" typo in the output message.
 * NOTE(review): "%i" assumes context.row/col are int — if they are
 * unsigned/uint32_t this is a format mismatch; confirm against the
 * TokenContext declaration in tokenizer.h. */
void Token_Print(Token* token)
{
    printf("Token: type: %s, TokenContent: ", TokenType_ToString(token->type));
    _TokenContent_Print(token);
    printf(" , context: row: %i, col: %i\n", token->context.row, token->context.col);
}
|
|
|
|
|