//
// Created by Natuie on 2025/3/22.
//

#include <ctype.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h> // free()
#include "lexer.h"

// Initialize the lexer over the given source buffer.
void lexer_init(Lexer *lexer, char *source) {
    lexer->source = source;
    lexer->current_pos = 0;
    lexer->line = 1;
    lexer->column = 1;
}

// Release the source buffer owned by the lexer.
void lexer_free(Lexer *lexer) {
    free(lexer->source);
}

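/*
 * Note on ownership (derived from the code above): lexer_free() passes
 * lexer->source to free(), so the buffer handed to lexer_init() is expected
 * to be heap-allocated; for static or stack buffers, simply do not call
 * lexer_free().
 */
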
// Skip whitespace, line comments ("//") and block comments ("/* */").
void skip_whitespace_and_comments(Lexer *lexer) {
    while (1) {
        char c = lexer->source[lexer->current_pos];
        if (c == ' ' || c == '\t' || c == '\r') {
            lexer->current_pos++;
            lexer->column++;
        } else if (c == '\n') {
            lexer->current_pos++;
            lexer->line++;
            lexer->column = 1;
        } else if (c == '/' && lexer->source[lexer->current_pos + 1] == '/') {
            lexer->current_pos += 2; // skip "//"
            lexer->column += 2;
            while (lexer->source[lexer->current_pos] != '\n' &&
                   lexer->source[lexer->current_pos] != '\0') {
                lexer->current_pos++;
                lexer->column++;
            }
            // Consume the newline that ends the comment
            if (lexer->source[lexer->current_pos] == '\n') {
                lexer->current_pos++;
                lexer->line++;
                lexer->column = 1;
            } else {
                // End of file
                break;
            }
        } else if (c == '/' && lexer->source[lexer->current_pos + 1] == '*') {
            lexer->current_pos += 2; // skip "/*"
            lexer->column += 2;
            int in_comment = 1;
            while (in_comment && lexer->source[lexer->current_pos] != '\0') {
                c = lexer->source[lexer->current_pos];
                if (c == '\n') {
                    lexer->line++;
                    lexer->column = 1;
                } else if (c == '*' && lexer->source[lexer->current_pos + 1] == '/') {
                    // End of the block comment: advance one position here; the
                    // unconditional increment below consumes the closing '/'.
                    lexer->current_pos++;
                    lexer->column += 2;
                    in_comment = 0;
                } else {
                    lexer->column++;
                }
                lexer->current_pos++;
            }
        } else {
            break;
        }
    }
}

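/*
 * Notes on skip_whitespace_and_comments (derived from the code above): block
 * comments do not nest, and an unterminated block comment is silently skipped
 * up to the end of the input rather than reported as an error.
 */
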
// Read an identifier or keyword.
Token read_identifier(Lexer *lexer) {
    Token token;
    int pos = 0;
    token.type = TOKEN_IDENTIFIER;
    token.line = lexer->line;
    token.column = lexer->column;

    while (isalnum(lexer->source[lexer->current_pos]) ||
           (lexer->source[lexer->current_pos] == '_')) {
        if (pos < 255) {
            token.value[pos++] = lexer->source[lexer->current_pos];
        }
        lexer->current_pos++;
        lexer->column++;
    }
    token.value[pos] = '\0';

    // Keyword recognition
    if (strcmp(token.value, "func") == 0) token.type = TOKEN_FUNC;
    else if (strcmp(token.value, "let") == 0) token.type = TOKEN_LET;
    else if (strcmp(token.value, "const") == 0) token.type = TOKEN_CONST;
    else if (strcmp(token.value, "i8") == 0) token.type = TOKEN_TYPE_I8;
    else if (strcmp(token.value, "i16") == 0) token.type = TOKEN_TYPE_I16;
    else if (strcmp(token.value, "i32") == 0) token.type = TOKEN_TYPE_I32;
    else if (strcmp(token.value, "i64") == 0) token.type = TOKEN_TYPE_I64;
    else if (strcmp(token.value, "u8") == 0) token.type = TOKEN_TYPE_U8;
    else if (strcmp(token.value, "u16") == 0) token.type = TOKEN_TYPE_U16;
    else if (strcmp(token.value, "u32") == 0) token.type = TOKEN_TYPE_U32;
    else if (strcmp(token.value, "u64") == 0) token.type = TOKEN_TYPE_U64;
    else if (strcmp(token.value, "f32") == 0) token.type = TOKEN_TYPE_F32;
    else if (strcmp(token.value, "f64") == 0) token.type = TOKEN_TYPE_F64;
    else if (strcmp(token.value, "void") == 0) token.type = TOKEN_TYPE_VOID;
    else if (strcmp(token.value, "any") == 0) token.type = TOKEN_TYPE_ANY;
    else if (strcmp(token.value, "int") == 0) token.type = TOKEN_INT;
    else if (strcmp(token.value, "float") == 0) token.type = TOKEN_FLOAT;
    else if (strcmp(token.value, "string") == 0) token.type = TOKEN_STRING;
    else if (strcmp(token.value, "bool") == 0) token.type = TOKEN_BOOL;
    else if (strcmp(token.value, "array") == 0) token.type = TOKEN_ARRAY;
    else if (strcmp(token.value, "map") == 0) token.type = TOKEN_MAP;
    else if (strcmp(token.value, "true") == 0) token.type = TOKEN_TRUE;
    else if (strcmp(token.value, "false") == 0) token.type = TOKEN_FALSE;
    else if (strcmp(token.value, "if") == 0) token.type = TOKEN_IF;
    else if (strcmp(token.value, "else") == 0) token.type = TOKEN_ELSE;
    else if (strcmp(token.value, "while") == 0) token.type = TOKEN_WHILE;
    else if (strcmp(token.value, "for") == 0) token.type = TOKEN_FOR;
    else if (strcmp(token.value, "break") == 0) token.type = TOKEN_BREAK;
    else if (strcmp(token.value, "continue") == 0) token.type = TOKEN_CONTINUE;
    else if (strcmp(token.value, "return") == 0) token.type = TOKEN_RETURN;
    else if (strcmp(token.value, "import") == 0) token.type = TOKEN_IMPORT;
    else if (strcmp(token.value, "as") == 0) token.type = TOKEN_AS;
    else if (strcmp(token.value, "and") == 0) token.type = TOKEN_AND;
    else if (strcmp(token.value, "or") == 0) token.type = TOKEN_OR;
    else if (strcmp(token.value, "in") == 0) token.type = TOKEN_IN;
    else if (strcmp(token.value, "native") == 0) token.type = TOKEN_NATIVE;
    else if (strcmp(token.value, "try") == 0) token.type = TOKEN_TRY;
    else if (strcmp(token.value, "catch") == 0) token.type = TOKEN_CATCH;
    else if (strcmp(token.value, "throw") == 0) token.type = TOKEN_THROW;
    else token.type = TOKEN_IDENTIFIER;
    return token;
}

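/*
 * Behavior notes for read_identifier (derived from the code above): an exact
 * keyword match produces the corresponding keyword token, anything else stays
 * TOKEN_IDENTIFIER, e.g. "while" -> TOKEN_WHILE but "while2" -> TOKEN_IDENTIFIER.
 * Names longer than 255 characters are truncated to fit token.value, although
 * the remaining characters are still consumed.
 */
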
// Read a numeric literal (decimal, binary, octal, hexadecimal, float, exponent).
Token read_number(Lexer *lexer) {
    Token token;
    int pos = 0;
    int has_dot = 0;
    int has_exp = 0;
    int base = 10; // decimal by default
    token.line = lexer->line;
    token.column = lexer->column;

    // Check for a non-decimal prefix
    if (lexer->source[lexer->current_pos] == '0') {
        char next = lexer->source[lexer->current_pos + 1];
        if (next == 'b' || next == 'B') { // binary
            base = 2;
            token.value[pos++] = '0';
            token.value[pos++] = next;
            lexer->current_pos += 2;
            lexer->column += 2;
            // Read binary digits
            while (lexer->source[lexer->current_pos] == '0' ||
                   lexer->source[lexer->current_pos] == '1') {
                if (pos < 255) {
                    token.value[pos++] = lexer->source[lexer->current_pos];
                }
                lexer->current_pos++;
                lexer->column++;
            }
            if (pos <= 2) { // prefix with no digits
                fprintf(stderr, "Error: Invalid binary number at line %d, column %d\n",
                        lexer->line, lexer->column);
                token.type = TOKEN_ERROR;
                return token;
            }
        } else if (next == 'x' || next == 'X') { // hexadecimal
            base = 16;
            token.value[pos++] = '0';
            token.value[pos++] = next;
            lexer->current_pos += 2;
            lexer->column += 2;
            // Read hexadecimal digits
            while (isxdigit(lexer->source[lexer->current_pos])) {
                if (pos < 255) {
                    token.value[pos++] = lexer->source[lexer->current_pos];
                }
                lexer->current_pos++;
                lexer->column++;
            }
            if (pos <= 2) { // prefix with no digits
                fprintf(stderr, "Error: Invalid hexadecimal number at line %d, column %d\n",
                        lexer->line, lexer->column);
                token.type = TOKEN_ERROR;
                return token;
            }
        } else if (isdigit(next)) { // octal
            // A leading 0 followed by another digit is an octal literal; plain
            // "0", "0.5" and "0e3" fall through to the decimal path below.
            base = 8;
            token.value[pos++] = '0';
            lexer->current_pos++; // skip the leading 0
            lexer->column++;
            // Read the remaining octal digits
            while (lexer->source[lexer->current_pos] >= '0' &&
                   lexer->source[lexer->current_pos] <= '7') {
                if (pos < 255) {
                    token.value[pos++] = lexer->source[lexer->current_pos];
                }
                lexer->current_pos++;
                lexer->column++;
            }
            // A following digit outside the octal range is an error
            if (isdigit(lexer->source[lexer->current_pos])) {
                fprintf(stderr, "Error: Invalid octal number at line %d, column %d\n",
                        lexer->line, lexer->column);
                token.type = TOKEN_ERROR;
                return token;
            }
        }
    }

    // Not a prefixed literal: handle as decimal
    if (base == 10) {
        // Read the integer part
        while (isdigit(lexer->source[lexer->current_pos])) {
            if (pos < 255) {
                token.value[pos++] = lexer->source[lexer->current_pos];
            }
            lexer->current_pos++;
            lexer->column++;
        }

        // Handle the decimal point and fractional part
        if (lexer->source[lexer->current_pos] == '.') {
            has_dot = 1;
            if (pos < 255) {
                token.value[pos++] = lexer->source[lexer->current_pos];
            }
            lexer->current_pos++;
            lexer->column++;

            // A digit must follow the decimal point
            if (!isdigit(lexer->source[lexer->current_pos])) {
                fprintf(stderr, "Error: Expected digit after decimal point at line %d, column %d\n",
                        lexer->line, lexer->column);
                token.type = TOKEN_ERROR;
                return token;
            }

            // Read the fractional digits
            while (isdigit(lexer->source[lexer->current_pos])) {
                if (pos < 255) {
                    token.value[pos++] = lexer->source[lexer->current_pos];
                }
                lexer->current_pos++;
                lexer->column++;
            }
        }

        // Handle scientific notation
        if (lexer->source[lexer->current_pos] == 'e' || lexer->source[lexer->current_pos] == 'E') {
            has_exp = 1;
            if (pos < 255) {
                token.value[pos++] = lexer->source[lexer->current_pos];
            }
            lexer->current_pos++;
            lexer->column++;

            // Optional exponent sign
            if (lexer->source[lexer->current_pos] == '+' || lexer->source[lexer->current_pos] == '-') {
                if (pos < 255) {
                    token.value[pos++] = lexer->source[lexer->current_pos];
                }
                lexer->current_pos++;
                lexer->column++;
            }

            // The exponent must contain at least one digit
            if (!isdigit(lexer->source[lexer->current_pos])) {
                fprintf(stderr, "Error: Invalid scientific notation at line %d, column %d\n",
                        lexer->line, lexer->column);
                token.type = TOKEN_ERROR;
                return token;
            }

            while (isdigit(lexer->source[lexer->current_pos])) {
                if (pos < 255) {
                    token.value[pos++] = lexer->source[lexer->current_pos];
                }
                lexer->current_pos++;
                lexer->column++;
            }
        }
    }
    token.value[pos] = '\0';
    token.type = (has_dot || has_exp) ? TOKEN_FLOAT_LITERAL : TOKEN_INT_LITERAL;
    return token;
}

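/*
 * Examples of read_number's output (derived from the code above):
 *   "42"     -> TOKEN_INT_LITERAL   value "42"
 *   "0x1F"   -> TOKEN_INT_LITERAL   value "0x1F"
 *   "0b1010" -> TOKEN_INT_LITERAL   value "0b1010"
 *   "017"    -> TOKEN_INT_LITERAL   value "017"   (octal)
 *   "0.5"    -> TOKEN_FLOAT_LITERAL value "0.5"
 *   "1e-3"   -> TOKEN_FLOAT_LITERAL value "1e-3"
 *   "0x", "08", "1.", "1e+" -> TOKEN_ERROR
 * The literal text is kept verbatim in token.value; numeric conversion is not
 * performed here.
 */
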
// Read a string literal enclosed in double quotes.
Token read_string(Lexer *lexer) {
    Token token;
    int pos = 0;
    token.type = TOKEN_STRING_LITERAL;
    token.line = lexer->line;
    token.column = lexer->column;

    lexer->current_pos++; // skip the opening quote
    lexer->column++;

    while (lexer->source[lexer->current_pos] != '"') {
        if (lexer->source[lexer->current_pos] == '\0') {
            token.type = TOKEN_ERROR;
            strcpy(token.value, "Unterminated string literal");
            return token;
        }

        if (lexer->source[lexer->current_pos] == '\\') {
            lexer->current_pos++;
            lexer->column++;

            char escaped;
            switch (lexer->source[lexer->current_pos]) {
                case 'n':  escaped = '\n'; break;
                case 't':  escaped = '\t'; break;
                case 'r':  escaped = '\r'; break;
                case '\"': escaped = '\"'; break;
                case '\\': escaped = '\\'; break;
                default:
                    fprintf(stderr, "Error: Invalid escape sequence '\\%c' at line %d, column %d\n",
                            lexer->source[lexer->current_pos], lexer->line, lexer->column);
                    token.type = TOKEN_ERROR;
                    return token;
            }
            // Store the decoded escape; characters beyond 255 are dropped but
            // still consumed, so the closing quote is found correctly.
            if (pos < 255) {
                token.value[pos++] = escaped;
            }
        } else if (pos < 255) {
            token.value[pos++] = lexer->source[lexer->current_pos];
        }
        lexer->current_pos++;
        lexer->column++;
    }
    token.value[pos] = '\0';
    if (lexer->source[lexer->current_pos] == '"') {
        lexer->current_pos++;
        lexer->column++;
    } else {
        token.type = TOKEN_ERROR;
        strcpy(token.value, "Unterminated string literal");
    }
    return token;
}

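/*
 * Behavior notes for read_string (derived from the code above): the supported
 * escape sequences are \n, \t, \r, \" and \\; any other escape is reported and
 * yields TOKEN_ERROR. Strings longer than 255 characters are truncated to fit
 * token.value, and a missing closing quote yields TOKEN_ERROR with the message
 * "Unterminated string literal".
 */
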
Token lexer_next_token(Lexer *lexer) {
    skip_whitespace_and_comments(lexer);
    char c = lexer->source[lexer->current_pos];

    Token token;
    token.line = lexer->line;
    token.column = lexer->column;
    token.value[0] = '\0';

    if (c == '\0') {
        token.type = TOKEN_EOF;
        return token;
    } else if (isalpha(c) || (c == '_')) {
        return read_identifier(lexer);
    } else if (isdigit(c)) {
        return read_number(lexer);
    } else if (c == '"') {
        return read_string(lexer);
    } else {
        token.value[0] = c;
        token.value[1] = '\0';
        lexer->current_pos++;
        lexer->column++;

        // Handle two- and three-character operators
        char next_c = lexer->source[lexer->current_pos];
        char next_next_c = (next_c != '\0') ? lexer->source[lexer->current_pos + 1] : '\0';

        // Three-character operators
        if (c == '<' && next_c == '<' && next_next_c == '=') {
            token.value[1] = '<';
            token.value[2] = '=';
            token.value[3] = '\0';
            lexer->current_pos += 2;
            lexer->column += 2;
            token.type = TOKEN_SHL_ASSIGN;
            return token;
        } else if (c == '>' && next_c == '>' && next_next_c == '=') {
            token.value[1] = '>';
            token.value[2] = '=';
            token.value[3] = '\0';
            lexer->current_pos += 2;
            lexer->column += 2;
            token.type = TOKEN_SHR_ASSIGN;
            return token;
        }

        // Two-character operators, matched against a small lookup table.
        static const struct { char first, second; int type; } two_char_ops[] = {
            {'=', '=', TOKEN_EQ_EQ},        {'!', '=', TOKEN_NE},
            {'<', '=', TOKEN_LE},           {'>', '=', TOKEN_GE},
            {'+', '=', TOKEN_PLUS_ASSIGN},  {'-', '=', TOKEN_MINUS_ASSIGN},
            {'*', '=', TOKEN_MUL_ASSIGN},   {'/', '=', TOKEN_DIV_ASSIGN},
            {'%', '=', TOKEN_MOD_ASSIGN},   {'&', '=', TOKEN_BITAND_ASSIGN},
            {'|', '=', TOKEN_BITOR_ASSIGN}, {'^', '=', TOKEN_BITXOR_ASSIGN},
            {'<', '<', TOKEN_SHL},          {'>', '>', TOKEN_SHR},
            {'+', '+', TOKEN_PLUS_PLUS},    {'-', '-', TOKEN_MINUS_MINUS},
            {'&', '&', TOKEN_AND},          {'|', '|', TOKEN_OR},
        };
        for (size_t i = 0; i < sizeof(two_char_ops) / sizeof(two_char_ops[0]); i++) {
            if (c == two_char_ops[i].first && next_c == two_char_ops[i].second) {
                token.value[1] = next_c;
                token.value[2] = '\0';
                lexer->current_pos++;
                lexer->column++;
                token.type = two_char_ops[i].type;
                return token;
            }
        }

        switch (c) {
            case ':': token.type = TOKEN_COLON; break;
            case '=': token.type = TOKEN_EQ; break;
            case '(': token.type = TOKEN_LPAREN; break;
            case ')': token.type = TOKEN_RPAREN; break;
            case '{': token.type = TOKEN_LBRACE; break;
            case '}': token.type = TOKEN_RBRACE; break;
            case '[': token.type = TOKEN_LBRACKET; break;
            case ']': token.type = TOKEN_RBRACKET; break;
            case ',': token.type = TOKEN_COMMA; break;
            case ';': token.type = TOKEN_SEMICOLON; break;
            case '+': token.type = TOKEN_PLUS; break;
            case '-': token.type = TOKEN_MINUS; break;
            case '*': token.type = TOKEN_STAR; break;
            case '/': token.type = TOKEN_SLASH; break;
            case '.': token.type = TOKEN_DOT; break;
            case '<': token.type = TOKEN_LT; break;
            case '>': token.type = TOKEN_GT; break;
            case '!': token.type = TOKEN_BANG; break;
            case '|': token.type = TOKEN_BITOR; break;
            case '&': token.type = TOKEN_BITAND; break;
            case '^': token.type = TOKEN_BITXOR; break;
            case '%': token.type = TOKEN_MOD; break;
            case '@': token.type = TOKEN_AT; break;
            case '~': token.type = TOKEN_TILDE; break;
            case '?': token.type = TOKEN_QUESTION; break;

            default:
                token.type = TOKEN_ERROR;
                sprintf(token.value, "Unexpected character: %c", c);
        }
        return token;
    }
}
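
/*
 * Minimal usage sketch (an illustrative addition, not part of the original
 * lexer): tokenize a small snippet and print every token. The LEXER_DEMO
 * guard is an assumption so that this translation unit still builds as a
 * plain library by default; compile with -DLEXER_DEMO to enable the demo.
 */
#ifdef LEXER_DEMO
int main(void) {
    const char *demo = "let x: i32 = 0x1F; // demo input";
    char *buf = malloc(strlen(demo) + 1);
    if (buf == NULL) return 1;
    strcpy(buf, demo);

    Lexer lexer;
    lexer_init(&lexer, buf); // lexer_free() will free buf
    for (;;) {
        Token tok = lexer_next_token(&lexer);
        if (tok.type == TOKEN_EOF || tok.type == TOKEN_ERROR) break;
        printf("%d:%d\ttype=%d\tvalue=\"%s\"\n", tok.line, tok.column, (int)tok.type, tok.value);
    }
    lexer_free(&lexer);
    return 0;
}
#endif /* LEXER_DEMO */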