// // Created by Natuie on 2025/3/22. // #include #include #include #include #include "lexer.h" void lexer_init(Lexer *lexer, char *source) { lexer->source = source; lexer->current_pos = 0; lexer->line = 1; lexer->column = 1; } void lexer_free(Lexer *lexer) { free(lexer->source); } // 跳过空白字符 void skip_whitespace_and_comments(Lexer *lexer) { while (1) { char c = lexer->source[lexer->current_pos]; if (c == ' ' || c == '\t' || c == '\r') { lexer->current_pos++; lexer->column++; } else if (c == '\n') { lexer->current_pos++; lexer->line++; lexer->column = 1; } else if (c == '/' && lexer->source[lexer->current_pos+1] == '/') { lexer->current_pos += 2; // 跳过"//" lexer->column += 2; while (lexer->source[lexer->current_pos] != '\n' && lexer->source[lexer->current_pos] != '\0') { lexer->current_pos++; lexer->column++; } // 处理换行符 if (lexer->source[lexer->current_pos] == '\n') { lexer->current_pos++; lexer->line++; lexer->column = 1; } else { // 文件末尾 break; } } else if (c == '/' && lexer->source[lexer->current_pos+1] == '*') { lexer->current_pos += 2; // 跳过"/*" lexer->column += 2; int in_comment = 1; while (in_comment && lexer->source[lexer->current_pos] != '\0') { c = lexer->source[lexer->current_pos]; if (c == '\n') { lexer->line++; lexer->column = 1; } else if (c == '*' && lexer->source[lexer->current_pos+1] == '/') { // 结束注释 lexer->current_pos += 2; lexer->column += 2; in_comment = 0; } else { lexer->column++; } lexer->current_pos++; } } else { break; } } } // 读取标识符 Token read_identifier(Lexer *lexer) { Token token; int pos = 0; token.type = TOKEN_IDENTIFIER; token.line = lexer->line; token.column = lexer->column; while (isalnum(lexer->source[lexer->current_pos]) || (lexer->source[lexer->current_pos] == '_')) { if (pos < 255) { token.value[pos++] = lexer->source[lexer->current_pos]; } lexer->current_pos++; lexer->column++; } token.value[pos] = '\0'; // 关键字识别 if (strcmp(token.value, "func") == 0) token.type = TOKEN_FUNC; else if (strcmp(token.value, "let") == 0) token.type = TOKEN_LET; else if (strcmp(token.value, "const") == 0) token.type = TOKEN_CONST; else if (strcmp(token.value, "i8") == 0) token.type = TOKEN_TYPE_I8; else if (strcmp(token.value, "i16") == 0) token.type = TOKEN_TYPE_I16; else if (strcmp(token.value, "i32") == 0) token.type = TOKEN_TYPE_I32; else if (strcmp(token.value, "i64") == 0) token.type = TOKEN_TYPE_I64; else if (strcmp(token.value, "u8") == 0) token.type = TOKEN_TYPE_U8; else if (strcmp(token.value, "u16") == 0) token.type = TOKEN_TYPE_U16; else if (strcmp(token.value, "u32") == 0) token.type = TOKEN_TYPE_U32; else if (strcmp(token.value, "u64") == 0) token.type = TOKEN_TYPE_U64; else if (strcmp(token.value, "f32") == 0) token.type = TOKEN_TYPE_F32; else if (strcmp(token.value, "f64") == 0) token.type = TOKEN_TYPE_F64; else if (strcmp(token.value, "void") == 0) token.type = TOKEN_TYPE_VOID; else if (strcmp(token.value, "any") == 0) token.type = TOKEN_TYPE_ANY; else if (strcmp(token.value, "int") == 0) token.type = TOKEN_INT; else if (strcmp(token.value, "float") == 0) token.type = TOKEN_FLOAT; else if (strcmp(token.value, "string") == 0) token.type = TOKEN_STRING; else if (strcmp(token.value, "bool") == 0) token.type = TOKEN_BOOL; else if (strcmp(token.value, "array") == 0) token.type = TOKEN_ARRAY; else if (strcmp(token.value, "map") == 0) token.type = TOKEN_MAP; else if (strcmp(token.value, "true") == 0) token.type = TOKEN_TRUE; else if (strcmp(token.value, "false") == 0) token.type = TOKEN_FALSE; else if (strcmp(token.value, "if") == 0) token.type = TOKEN_IF; else if (strcmp(token.value, "else") == 0) token.type = TOKEN_ELSE; else if (strcmp(token.value, "while") == 0) token.type = TOKEN_WHILE; else if (strcmp(token.value, "for") == 0) token.type = TOKEN_FOR; else if (strcmp(token.value, "break") == 0) token.type = TOKEN_BREAK; else if (strcmp(token.value, "continue") == 0) token.type = TOKEN_CONTINUE; else if (strcmp(token.value, "return") == 0) token.type = TOKEN_RETURN; else if (strcmp(token.value, "import") == 0) token.type = TOKEN_IMPORT; else if (strcmp(token.value, "as") == 0) token.type = TOKEN_AS; else if (strcmp(token.value, "and") == 0) token.type = TOKEN_AND; else if (strcmp(token.value, "or") == 0) token.type = TOKEN_OR; else if (strcmp(token.value, "in") == 0) token.type = TOKEN_IN; else if (strcmp(token.value, "native") == 0) token.type = TOKEN_NATIVE; else if (strcmp(token.value, "try") == 0) token.type = TOKEN_TRY; else if (strcmp(token.value, "catch") == 0) token.type = TOKEN_CATCH; else if (strcmp(token.value, "throw") == 0) token.type = TOKEN_THROW; else token.type = TOKEN_IDENTIFIER; return token; } // 读取数字 Token read_number(Lexer *lexer) { Token token; int pos = 0; int has_dot = 0; int has_exp = 0; int base = 10; // 默认十进制 token.line = lexer->line; token.column = lexer->column; // 检查是否是特殊进制数 if (lexer->source[lexer->current_pos] == '0') { char next = lexer->source[lexer->current_pos + 1]; if (next == 'b' || next == 'B') { // 二进制 base = 2; token.value[pos++] = '0'; token.value[pos++] = next; lexer->current_pos += 2; lexer->column += 2; // 读取二进制数字 while (lexer->source[lexer->current_pos] == '0' || lexer->source[lexer->current_pos] == '1') { if (pos < 255) { token.value[pos++] = lexer->source[lexer->current_pos]; } lexer->current_pos++; lexer->column++; } if (pos <= 2) { // 只有前缀没有数字 fprintf(stderr, "Error: Invalid binary number at line %d, column %d\n", lexer->line, lexer->column); token.type = TOKEN_ERROR; return token; } } else if (next == 'x' || next == 'X') { // 十六进制 base = 16; token.value[pos++] = '0'; token.value[pos++] = next; lexer->current_pos += 2; lexer->column += 2; // 读取十六进制数字 while (isxdigit(lexer->source[lexer->current_pos])) { if (pos < 255) { token.value[pos++] = lexer->source[lexer->current_pos]; } lexer->current_pos++; lexer->column++; } if (pos <= 2) { // 只有前缀没有数字 fprintf(stderr, "Error: Invalid hexadecimal number at line %d, column %d\n", lexer->line, lexer->column); token.type = TOKEN_ERROR; return token; } } else { // 八进制 base = 8; token.value[pos++] = '0'; lexer->current_pos++; // 跳过0 lexer->column++; // 读取后续的八进制数字 while (lexer->source[lexer->current_pos] >= '0' && lexer->source[lexer->current_pos] <= '7') { if (pos < 255) { token.value[pos++] = lexer->source[lexer->current_pos]; } lexer->current_pos++; lexer->column++; } // 检查是否后续字符是数字但超出八进制范围 if (isdigit(lexer->source[lexer->current_pos])) { fprintf(stderr, "Error: Invalid octal number at line %d, column %d\n", lexer->line, lexer->column); token.type = TOKEN_ERROR; return token; } } } // 如果不是特殊进制,按照十进制处理 if (base == 10) { // 读取整数部分 while (isdigit(lexer->source[lexer->current_pos])) { if (pos < 255) { token.value[pos++] = lexer->source[lexer->current_pos]; } lexer->current_pos++; lexer->column++; } // 处理小数点和小数部分 if (lexer->source[lexer->current_pos] == '.') { has_dot = 1; if (pos < 255) { token.value[pos++] = lexer->source[lexer->current_pos]; } lexer->current_pos++; lexer->column++; // 检查小数点后是否有数字 if (!isdigit(lexer->source[lexer->current_pos])) { fprintf(stderr, "Error: Expected digit after decimal point at line %d, column %d\n", lexer->line, lexer->column); token.type = TOKEN_ERROR; return token; } // 读取小数部分 while (isdigit(lexer->source[lexer->current_pos])) { if (pos < 255) { token.value[pos++] = lexer->source[lexer->current_pos]; } lexer->current_pos++; lexer->column++; } } // 处理科学计数法 if (lexer->source[lexer->current_pos] == 'e' || lexer->source[lexer->current_pos] == 'E') { has_exp = 1; if (pos < 255) { token.value[pos++] = lexer->source[lexer->current_pos]; } lexer->current_pos++; lexer->column++; // 处理指数的符号 if (lexer->source[lexer->current_pos] == '+' || lexer->source[lexer->current_pos] == '-') { if (pos < 255) { token.value[pos++] = lexer->source[lexer->current_pos]; } lexer->current_pos++; lexer->column++; } // 读取指数部分 if (!isdigit(lexer->source[lexer->current_pos])) { // 错误处理:科学计数法后面必须有数字 fprintf(stderr, "Error: Invalid scientific notation at line %d, column %d\n", lexer->line, lexer->column); token.type = TOKEN_ERROR; return token; } while (isdigit(lexer->source[lexer->current_pos])) { if (pos < 255) { token.value[pos++] = lexer->source[lexer->current_pos]; } lexer->current_pos++; lexer->column++; } } } token.value[pos] = '\0'; token.type = (has_dot || has_exp) ? TOKEN_FLOAT_LITERAL : TOKEN_INT_LITERAL; return token; } // 读取字符串 Token read_string(Lexer *lexer) { Token token; int pos = 0; token.type = TOKEN_STRING_LITERAL; token.line = lexer->line; token.column = lexer->column; lexer->current_pos++; // 跳过引号 lexer->column++; while (lexer->source[lexer->current_pos] != '"') { if (lexer->source[lexer->current_pos] == '\0') { token.type = TOKEN_ERROR; strcpy(token.value, "Unterminated string literal"); return token; } if (lexer->source[lexer->current_pos] == '\\') { lexer->current_pos++; lexer->column++; if (pos >= 255) continue; switch (lexer->source[lexer->current_pos]) { case 'n': token.value[pos++] = '\n'; break; case 't': token.value[pos++] = '\t'; break; case 'r': token.value[pos++] = '\r'; break; case '\"': token.value[pos++] = '\"'; break; case '\\': token.value[pos++] = '\\'; break; default: fprintf(stderr, "Error: Invalid escape sequence '\\%c' at line %d, column %d\n", lexer->source[lexer->current_pos], lexer->line, lexer->column); token.type = TOKEN_ERROR; return token; } } else if (pos < 255) { token.value[pos++] = lexer->source[lexer->current_pos]; } lexer->current_pos++; lexer->column++; } token.value[pos] = '\0'; if (lexer->source[lexer->current_pos] == '"') { lexer->current_pos++; lexer->column++; } else { token.type = TOKEN_ERROR; strcpy(token.value, "Unterminated string literal"); } return token; } Token lexer_next_token(Lexer *lexer) { skip_whitespace_and_comments(lexer); char c = lexer->source[lexer->current_pos]; Token token; token.line = lexer->line; token.column = lexer->column; token.value[0] = '\0'; if (c == '\0') { token.type = TOKEN_EOF; return token; } else if (isalpha(c) || (c == '_')) { return read_identifier(lexer); } else if (isdigit(c)) { return read_number(lexer); } else if (c == '"') { return read_string(lexer); } else { token.value[0] = c; token.value[1] = '\0'; lexer->current_pos++; lexer->column++; // 处理双字符和三字符操作符 char next_c = lexer->source[lexer->current_pos]; char next_next_c = (next_c != '\0') ? lexer->source[lexer->current_pos + 1] : '\0'; // 处理三字符操作符 if (c == '<' && next_c == '<' && next_next_c == '=') { token.value[1] = '<'; token.value[2] = '='; token.value[3] = '\0'; lexer->current_pos += 2; lexer->column += 2; token.type = TOKEN_SHL_ASSIGN; return token; } else if (c == '>' && next_c == '>' && next_next_c == '=') { token.value[1] = '>'; token.value[2] = '='; token.value[3] = '\0'; lexer->current_pos += 2; lexer->column += 2; token.type = TOKEN_SHR_ASSIGN; return token; } // 处理双字符操作符 if (c == '=' && next_c == '=') { token.value[1] = '='; token.value[2] = '\0'; lexer->current_pos++; lexer->column++; token.type = TOKEN_EQ_EQ; return token; } else if (c == '!' && next_c == '=') { token.value[1] = '='; token.value[2] = '\0'; lexer->current_pos++; lexer->column++; token.type = TOKEN_NE; return token; } else if (c == '<' && next_c == '=') { token.value[1] = '='; token.value[2] = '\0'; lexer->current_pos++; lexer->column++; token.type = TOKEN_LE; return token; } else if (c == '>' && next_c == '=') { token.value[1] = '='; token.value[2] = '\0'; lexer->current_pos++; lexer->column++; token.type = TOKEN_GE; return token; } else if (c == '+' && next_c == '=') { token.value[1] = '='; token.value[2] = '\0'; lexer->current_pos++; lexer->column++; token.type = TOKEN_PLUS_ASSIGN; return token; } else if (c == '-' && next_c == '=') { token.value[1] = '='; token.value[2] = '\0'; lexer->current_pos++; lexer->column++; token.type = TOKEN_MINUS_ASSIGN; return token; } else if (c == '*' && next_c == '=') { token.value[1] = '='; token.value[2] = '\0'; lexer->current_pos++; lexer->column++; token.type = TOKEN_MUL_ASSIGN; return token; } else if (c == '/' && next_c == '=') { token.value[1] = '='; token.value[2] = '\0'; lexer->current_pos++; lexer->column++; token.type = TOKEN_DIV_ASSIGN; return token; } else if (c == '%' && next_c == '=') { token.value[1] = '='; token.value[2] = '\0'; lexer->current_pos++; lexer->column++; token.type = TOKEN_MOD_ASSIGN; return token; } else if (c == '&' && next_c == '=') { token.value[1] = '='; token.value[2] = '\0'; lexer->current_pos++; lexer->column++; token.type = TOKEN_BITAND_ASSIGN; return token; } else if (c == '|' && next_c == '=') { token.value[1] = '='; token.value[2] = '\0'; lexer->current_pos++; lexer->column++; token.type = TOKEN_BITOR_ASSIGN; return token; } else if (c == '^' && next_c == '=') { token.value[1] = '='; token.value[2] = '\0'; lexer->current_pos++; lexer->column++; token.type = TOKEN_BITXOR_ASSIGN; return token; } else if (c == '<' && next_c == '<') { token.value[1] = '<'; token.value[2] = '\0'; lexer->current_pos++; lexer->column++; token.type = TOKEN_SHL; return token; } else if (c == '>' && next_c == '>') { token.value[1] = '>'; token.value[2] = '\0'; lexer->current_pos++; lexer->column++; token.type = TOKEN_SHR; return token; } else if (c == '+' && next_c == '+') { token.value[1] = '+'; token.value[2] = '\0'; lexer->current_pos++; lexer->column++; token.type = TOKEN_PLUS_PLUS; return token; } else if (c == '-' && next_c == '-') { token.value[1] = '-'; token.value[2] = '\0'; lexer->current_pos++; lexer->column++; token.type = TOKEN_MINUS_MINUS; return token; } else if (c == '&' && next_c == '&') { token.value[1] = '&'; token.value[2] = '\0'; lexer->current_pos++; lexer->column++; token.type = TOKEN_AND; return token; } else if (c == '|' && next_c == '|') { token.value[1] = '|'; token.value[2] = '\0'; lexer->current_pos++; lexer->column++; token.type = TOKEN_OR; return token; } switch (c) { case ':': token.type = TOKEN_COLON; break; case '=': token.type = TOKEN_EQ; break; case '(': token.type = TOKEN_LPAREN; break; case ')': token.type = TOKEN_RPAREN; break; case '{': token.type = TOKEN_LBRACE; break; case '}': token.type = TOKEN_RBRACE; break; case '[': token.type = TOKEN_LBRACKET; break; case ']': token.type = TOKEN_RBRACKET; break; case ',': token.type = TOKEN_COMMA; break; case ';': token.type = TOKEN_SEMICOLON; break; case '+': token.type = TOKEN_PLUS; break; case '-': token.type = TOKEN_MINUS; break; case '*': token.type = TOKEN_STAR; break; case '/': token.type = TOKEN_SLASH; break; case '.': token.type = TOKEN_DOT; break; case '<': token.type = TOKEN_LT; break; case '>': token.type = TOKEN_GT; break; case '!': token.type = TOKEN_BANG; break; case '|': token.type = TOKEN_BITOR; break; case '&': token.type = TOKEN_BITAND; break; case '^': token.type = TOKEN_BITXOR; break; case '%': token.type = TOKEN_MOD; break; case '@': token.type = TOKEN_AT; break; case '~': token.type = TOKEN_TILDE; break; case '?': token.type = TOKEN_QUESTION; break; default: token.type = TOKEN_ERROR; sprintf(token.value, "Unexpected character: %c", c); } return token; } }