vety-language/parser/lexer.c

556 lines
21 KiB
C

//
// Created by Natuie on 2025/3/22.
//
#include <ctype.h>
#include <string.h>
#include <stdio.h>
#include <malloc.h>
#include "lexer.h"
// Prepares `lexer` to scan `source` from its very first character.
//
// The lexer keeps the `source` pointer as-is (no copy is made). Position
// tracking starts at byte offset 0, which is reported as line 1, column 1.
void lexer_init(Lexer *lexer, char *source) {
    // Human-facing coordinates are 1-based.
    lexer->line = 1;
    lexer->column = 1;

    // The read cursor is a 0-based offset into the source buffer.
    lexer->current_pos = 0;
    lexer->source = source;
}
// Releases the source buffer held by `lexer`.
//
// The buffer is passed to free(), so it must have been obtained from the
// allocator (or be NULL). The pointer is cleared afterwards so that calling
// lexer_free() twice does not double-free, and a NULL `lexer` is ignored.
void lexer_free(Lexer *lexer) {
    if (lexer == NULL) {
        return;
    }
    free(lexer->source);
    lexer->source = NULL;
}
// Skips whitespace, `//` line comments and `/* ... */` block comments.
//
// Advances lexer->current_pos to the first character that belongs to a token
// (or to the terminating '\0'), keeping lexer->line and lexer->column in sync.
// An unterminated block comment is consumed silently up to the end of input.
void skip_whitespace_and_comments(Lexer *lexer) {
    for (;;) {
        char c = lexer->source[lexer->current_pos];

        if (c == ' ' || c == '\t' || c == '\r') {
            lexer->current_pos++;
            lexer->column++;
        } else if (c == '\n') {
            lexer->current_pos++;
            lexer->line++;
            lexer->column = 1;
        } else if (c == '/' && lexer->source[lexer->current_pos + 1] == '/') {
            // Line comment: consume up to, but not including, the newline.
            // The newline (or end of input) is handled by the next iteration.
            lexer->current_pos += 2;
            lexer->column += 2;
            while (lexer->source[lexer->current_pos] != '\n' &&
                   lexer->source[lexer->current_pos] != '\0') {
                lexer->current_pos++;
                lexer->column++;
            }
        } else if (c == '/' && lexer->source[lexer->current_pos + 1] == '*') {
            // Block comment: consume through the closing "*/".
            lexer->current_pos += 2;
            lexer->column += 2;
            while (lexer->source[lexer->current_pos] != '\0') {
                char inner = lexer->source[lexer->current_pos];

                if (inner == '*' && lexer->source[lexer->current_pos + 1] == '/') {
                    // Step over exactly the two closing characters. Advancing
                    // any further would swallow the character that follows
                    // the comment (possibly the terminating '\0').
                    lexer->current_pos += 2;
                    lexer->column += 2;
                    break;
                }
                if (inner == '\n') {
                    lexer->line++;
                    lexer->column = 1;
                } else {
                    lexer->column++;
                }
                lexer->current_pos++;
            }
        } else {
            // Start of a token, or end of input.
            break;
        }
    }
}
// Reads an identifier or keyword starting at the current position.
//
// Consumes the longest run of ASCII letters, digits and underscores. The text
// is stored in token.value; characters beyond the 255th are consumed but not
// stored. If the stored text matches a reserved word, the matching keyword
// token type is returned; otherwise the token is TOKEN_IDENTIFIER.
// token.line / token.column record where the identifier starts.
Token read_identifier(Lexer *lexer) {
    // Reserved words and the token type each one maps to.
    static const struct {
        const char *word;
        int type;
    } keywords[] = {
        {"func", TOKEN_FUNC},
        {"let", TOKEN_LET},
        {"const", TOKEN_CONST},
        {"i8", TOKEN_TYPE_I8},
        {"i16", TOKEN_TYPE_I16},
        {"i32", TOKEN_TYPE_I32},
        {"i64", TOKEN_TYPE_I64},
        {"u8", TOKEN_TYPE_U8},
        {"u16", TOKEN_TYPE_U16},
        {"u32", TOKEN_TYPE_U32},
        {"u64", TOKEN_TYPE_U64},
        {"f32", TOKEN_TYPE_F32},
        {"f64", TOKEN_TYPE_F64},
        {"void", TOKEN_TYPE_VOID},
        {"any", TOKEN_TYPE_ANY},
        {"int", TOKEN_INT},
        {"float", TOKEN_FLOAT},
        {"string", TOKEN_STRING},
        {"bool", TOKEN_BOOL},
        {"array", TOKEN_ARRAY},
        {"map", TOKEN_MAP},
        {"true", TOKEN_TRUE},
        {"false", TOKEN_FALSE},
        {"if", TOKEN_IF},
        {"else", TOKEN_ELSE},
        {"while", TOKEN_WHILE},
        {"for", TOKEN_FOR},
        {"break", TOKEN_BREAK},
        {"continue", TOKEN_CONTINUE},
        {"return", TOKEN_RETURN},
        {"import", TOKEN_IMPORT},
        {"as", TOKEN_AS},
        {"and", TOKEN_AND},
        {"or", TOKEN_OR},
        {"in", TOKEN_IN},
        {"native", TOKEN_NATIVE},
        {"try", TOKEN_TRY},
        {"catch", TOKEN_CATCH},
        {"throw", TOKEN_THROW},
    };

    Token token;
    int pos = 0;

    token.type = TOKEN_IDENTIFIER;
    token.line = lexer->line;
    token.column = lexer->column;

    for (;;) {
        // <ctype.h> functions require a value representable as unsigned char;
        // passing a negative plain char is undefined behavior.
        unsigned char c = (unsigned char)lexer->source[lexer->current_pos];

        if (!isalnum(c) && c != '_') {
            break;
        }
        if (pos < 255) {
            token.value[pos++] = (char)c;
        }
        lexer->current_pos++;
        lexer->column++;
    }
    token.value[pos] = '\0';

    // Keyword recognition.
    for (size_t i = 0; i < sizeof keywords / sizeof keywords[0]; i++) {
        if (strcmp(token.value, keywords[i].word) == 0) {
            token.type = keywords[i].type;
            break;
        }
    }
    return token;
}
// 读取数字
Token read_number(Lexer *lexer) {
Token token;
int pos = 0;
int has_dot = 0;
int has_exp = 0;
int base = 10; // 默认十进制
token.line = lexer->line;
token.column = lexer->column;
// 检查是否是特殊进制数
if (lexer->source[lexer->current_pos] == '0') {
char next = lexer->source[lexer->current_pos + 1];
if (next == 'b' || next == 'B') { // 二进制
base = 2;
token.value[pos++] = '0';
token.value[pos++] = next;
lexer->current_pos += 2;
lexer->column += 2;
// 读取二进制数字
while (lexer->source[lexer->current_pos] == '0' ||
lexer->source[lexer->current_pos] == '1') {
if (pos < 255) {
token.value[pos++] = lexer->source[lexer->current_pos];
}
lexer->current_pos++;
lexer->column++;
}
if (pos <= 2) { // 只有前缀没有数字
fprintf(stderr, "Error: Invalid binary number at line %d, column %d\n",
lexer->line, lexer->column);
token.type = TOKEN_ERROR;
return token;
}
} else if (next == 'x' || next == 'X') { // 十六进制
base = 16;
token.value[pos++] = '0';
token.value[pos++] = next;
lexer->current_pos += 2;
lexer->column += 2;
// 读取十六进制数字
while (isxdigit(lexer->source[lexer->current_pos])) {
if (pos < 255) {
token.value[pos++] = lexer->source[lexer->current_pos];
}
lexer->current_pos++;
lexer->column++;
}
if (pos <= 2) { // 只有前缀没有数字
fprintf(stderr, "Error: Invalid hexadecimal number at line %d, column %d\n",
lexer->line, lexer->column);
token.type = TOKEN_ERROR;
return token;
}
} else { // 八进制
base = 8;
token.value[pos++] = '0';
lexer->current_pos++; // 跳过0
lexer->column++;
// 读取后续的八进制数字
while (lexer->source[lexer->current_pos] >= '0' &&
lexer->source[lexer->current_pos] <= '7') {
if (pos < 255) {
token.value[pos++] = lexer->source[lexer->current_pos];
}
lexer->current_pos++;
lexer->column++;
}
// 检查是否后续字符是数字但超出八进制范围
if (isdigit(lexer->source[lexer->current_pos])) {
fprintf(stderr, "Error: Invalid octal number at line %d, column %d\n",
lexer->line, lexer->column);
token.type = TOKEN_ERROR;
return token;
}
}
}
// 如果不是特殊进制,按照十进制处理
if (base == 10) {
// 读取整数部分
while (isdigit(lexer->source[lexer->current_pos])) {
if (pos < 255) {
token.value[pos++] = lexer->source[lexer->current_pos];
}
lexer->current_pos++;
lexer->column++;
}
// 处理小数点和小数部分
if (lexer->source[lexer->current_pos] == '.') {
has_dot = 1;
if (pos < 255) {
token.value[pos++] = lexer->source[lexer->current_pos];
}
lexer->current_pos++;
lexer->column++;
// 检查小数点后是否有数字
if (!isdigit(lexer->source[lexer->current_pos])) {
fprintf(stderr, "Error: Expected digit after decimal point at line %d, column %d\n",
lexer->line, lexer->column);
token.type = TOKEN_ERROR;
return token;
}
// 读取小数部分
while (isdigit(lexer->source[lexer->current_pos])) {
if (pos < 255) {
token.value[pos++] = lexer->source[lexer->current_pos];
}
lexer->current_pos++;
lexer->column++;
}
}
// 处理科学计数法
if (lexer->source[lexer->current_pos] == 'e' || lexer->source[lexer->current_pos] == 'E') {
has_exp = 1;
if (pos < 255) {
token.value[pos++] = lexer->source[lexer->current_pos];
}
lexer->current_pos++;
lexer->column++;
// 处理指数的符号
if (lexer->source[lexer->current_pos] == '+' || lexer->source[lexer->current_pos] == '-') {
if (pos < 255) {
token.value[pos++] = lexer->source[lexer->current_pos];
}
lexer->current_pos++;
lexer->column++;
}
// 读取指数部分
if (!isdigit(lexer->source[lexer->current_pos])) {
// 错误处理:科学计数法后面必须有数字
fprintf(stderr, "Error: Invalid scientific notation at line %d, column %d\n",
lexer->line, lexer->column);
token.type = TOKEN_ERROR;
return token;
}
while (isdigit(lexer->source[lexer->current_pos])) {
if (pos < 255) {
token.value[pos++] = lexer->source[lexer->current_pos];
}
lexer->current_pos++;
lexer->column++;
}
}
}
token.value[pos] = '\0';
token.type = (has_dot || has_exp) ? TOKEN_FLOAT_LITERAL : TOKEN_INT_LITERAL;
return token;
}
// Reads a double-quoted string literal; the cursor must be on the opening '"'.
//
// The decoded contents (without the quotes) are stored in token.value, with
// the escapes \n, \t, \r, \" and \\ translated. Characters beyond the 255th
// are consumed but not stored. token.line / token.column record the position
// of the opening quote. Raw newlines inside the literal are kept and advance
// lexer->line.
//
// Errors (token.type == TOKEN_ERROR):
//   - end of input before the closing quote: token.value is set to
//     "Unterminated string literal";
//   - unknown escape: a message is printed to stderr, the cursor is left on
//     the offending character, and token.value holds the text decoded so far.
Token read_string(Lexer *lexer) {
    Token token;
    int pos = 0;

    token.type = TOKEN_STRING_LITERAL;
    token.line = lexer->line;
    token.column = lexer->column;
    token.value[0] = '\0';

    lexer->current_pos++; // skip the opening quote
    lexer->column++;

    for (;;) {
        char c = lexer->source[lexer->current_pos];
        int raw_newline = 0;

        if (c == '"') {
            break;
        }
        if (c == '\0') {
            goto unterminated;
        }

        if (c == '\\') {
            lexer->current_pos++;
            lexer->column++;
            // Decode the escape even when the buffer is already full, so that
            // an escaped quote is never mistaken for the closing quote.
            switch (lexer->source[lexer->current_pos]) {
                case 'n': c = '\n'; break;
                case 't': c = '\t'; break;
                case 'r': c = '\r'; break;
                case '"': c = '"'; break;
                case '\\': c = '\\'; break;
                case '\0':
                    // Backslash was the last character of the input.
                    goto unterminated;
                default:
                    fprintf(stderr, "Error: Invalid escape sequence '\\%c' at line %d, column %d\n",
                            lexer->source[lexer->current_pos], lexer->line, lexer->column);
                    token.value[pos] = '\0';
                    token.type = TOKEN_ERROR;
                    return token;
            }
        } else if (c == '\n') {
            raw_newline = 1;
        }

        if (pos < 255) {
            token.value[pos++] = c;
        }
        lexer->current_pos++;
        if (raw_newline) {
            lexer->line++;
            lexer->column = 1;
        } else {
            lexer->column++;
        }
    }

    token.value[pos] = '\0';
    lexer->current_pos++; // skip the closing quote
    lexer->column++;
    return token;

unterminated:
    token.type = TOKEN_ERROR;
    strcpy(token.value, "Unterminated string literal");
    return token;
}
// Scans and returns the next token from the lexer's source.
//
// Leading whitespace and comments are skipped first. The returned token's
// line / column are the position of its first character. Dispatch:
//   '\0'                -> TOKEN_EOF with an empty value
//   letter or '_'       -> read_identifier()
//   digit               -> read_number()
//   '"'                 -> read_string()
//   anything else       -> operator / punctuation, longest match first
//                          (three-character, then two-character, then single)
// "&&" and "||" yield TOKEN_AND and TOKEN_OR respectively.
// An unrecognized character is consumed and returned as TOKEN_ERROR with
// value "Unexpected character: <c>".
Token lexer_next_token(Lexer *lexer) {
    // Multi-character operators. Three-character entries come first so that
    // "<<=" is not split into "<<" followed by "=".
    static const struct {
        const char *text;
        int type;
    } multi_char_ops[] = {
        {"<<=", TOKEN_SHL_ASSIGN},
        {">>=", TOKEN_SHR_ASSIGN},
        {"==", TOKEN_EQ_EQ},
        {"!=", TOKEN_NE},
        {"<=", TOKEN_LE},
        {">=", TOKEN_GE},
        {"+=", TOKEN_PLUS_ASSIGN},
        {"-=", TOKEN_MINUS_ASSIGN},
        {"*=", TOKEN_MUL_ASSIGN},
        {"/=", TOKEN_DIV_ASSIGN},
        {"%=", TOKEN_MOD_ASSIGN},
        {"&=", TOKEN_BITAND_ASSIGN},
        {"|=", TOKEN_BITOR_ASSIGN},
        {"^=", TOKEN_BITXOR_ASSIGN},
        {"<<", TOKEN_SHL},
        {">>", TOKEN_SHR},
        {"++", TOKEN_PLUS_PLUS},
        {"--", TOKEN_MINUS_MINUS},
        {"&&", TOKEN_AND},
        {"||", TOKEN_OR},
    };

    skip_whitespace_and_comments(lexer);

    Token token;
    char c = lexer->source[lexer->current_pos];

    token.line = lexer->line;
    token.column = lexer->column;
    token.value[0] = '\0';

    if (c == '\0') {
        token.type = TOKEN_EOF;
        return token;
    }
    // <ctype.h> functions need an unsigned char value; a negative plain char
    // (e.g. a non-ASCII byte) would be undefined behavior.
    if (isalpha((unsigned char)c) || c == '_') {
        return read_identifier(lexer);
    }
    if (isdigit((unsigned char)c)) {
        return read_number(lexer);
    }
    if (c == '"') {
        return read_string(lexer);
    }

    // Multi-character operators. strncmp stops at the terminating '\0', so
    // this never reads past the end of the source.
    const char *rest = lexer->source + lexer->current_pos;
    for (size_t i = 0; i < sizeof multi_char_ops / sizeof multi_char_ops[0]; i++) {
        const char *text = multi_char_ops[i].text;
        size_t len = strlen(text);

        if (strncmp(rest, text, len) == 0) {
            memcpy(token.value, text, len + 1); // includes the terminator
            lexer->current_pos += (int)len;
            lexer->column += (int)len;
            token.type = multi_char_ops[i].type;
            return token;
        }
    }

    // Single-character tokens.
    token.value[0] = c;
    token.value[1] = '\0';
    lexer->current_pos++;
    lexer->column++;

    switch (c) {
        case ':': token.type = TOKEN_COLON; break;
        case '=': token.type = TOKEN_EQ; break;
        case '(': token.type = TOKEN_LPAREN; break;
        case ')': token.type = TOKEN_RPAREN; break;
        case '{': token.type = TOKEN_LBRACE; break;
        case '}': token.type = TOKEN_RBRACE; break;
        case '[': token.type = TOKEN_LBRACKET; break;
        case ']': token.type = TOKEN_RBRACKET; break;
        case ',': token.type = TOKEN_COMMA; break;
        case ';': token.type = TOKEN_SEMICOLON; break;
        case '+': token.type = TOKEN_PLUS; break;
        case '-': token.type = TOKEN_MINUS; break;
        case '*': token.type = TOKEN_STAR; break;
        case '/': token.type = TOKEN_SLASH; break;
        case '.': token.type = TOKEN_DOT; break;
        case '<': token.type = TOKEN_LT; break;
        case '>': token.type = TOKEN_GT; break;
        case '!': token.type = TOKEN_BANG; break;
        case '|': token.type = TOKEN_BITOR; break;
        case '&': token.type = TOKEN_BITAND; break;
        case '^': token.type = TOKEN_BITXOR; break;
        case '%': token.type = TOKEN_MOD; break;
        case '@': token.type = TOKEN_AT; break;
        case '~': token.type = TOKEN_TILDE; break;
        case '?': token.type = TOKEN_QUESTION; break;
        default:
            token.type = TOKEN_ERROR;
            snprintf(token.value, sizeof token.value, "Unexpected character: %c", c);
            break;
    }
    return token;
}