/*
 * Lexer: turns the characters of one source file into `struct token` values.
 *
 * State is file-global: one open FILE, one character of lookahead, and the
 * current line/column used in error messages.
 */
#include "ccc.h"
#include "lexer.h"

/*
 * NOTE(review): the header names of the four system includes were missing in
 * the reviewed copy of this file. They are restored here from the library
 * names the code uses (ckd_add/ckd_mul, fprintf/fgetc, exit, strdup) --
 * confirm against the original.
 */
#include <stdckdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Print a lexer error prefixed with the current line and column, then exit
 * with status 1. Wrapped in do/while(0) so it behaves as a single statement
 * (safe in an unbraced if/else).
 */
#define LEXER_PANIC(format, ...) \
    do { \
        fprintf( \
            stderr, \
            "ccc: lexer error: line %lu, column %lu: " format "\n", \
            LINE, \
            COL __VA_OPT__(,) \
            __VA_ARGS__); \
        exit(1); \
    } while (0)

static FILE* file = NULL;        /* currently loaded source, or NULL */
static int lookahead;            /* next unread character, or EOF */
static unsigned long LINE, COL;  /* 1-based position of `lookahead` */

/*
 * Open `path` for lexing, closing any previously loaded file first.
 * Invokes CCC_PANIC if the file cannot be opened. Primes the lookahead
 * character and resets the position to line 1, column 1.
 */
void lexer_load(const char* path) {
    if (file != NULL) {
        fclose(file);
    }

    file = fopen(path, "r");
    if (file == NULL) CCC_PANIC;

    lookahead = fgetc(file);
    LINE = 1;
    COL = 1;
}

/* Close the loaded file, if any. Safe to call repeatedly. */
void lexer_close(void) {
    if (file == NULL) return;
    fclose(file);
    file = NULL;
}

/*
 * Read the next token into *p_token without advancing the lexer.
 *
 * Implemented as lexer_pop() followed by a rewind of the complete lexer
 * state: file offset, lookahead character, and line/column counters.
 * Returns what lexer_pop() returned (false when no file is loaded or no
 * token remains).
 *
 * Note that lexer_pop() heap-allocates identifier and string payloads, so
 * each peek of such a token produces a fresh allocation.
 */
bool lexer_peek(struct token* p_token) {
    if (file == NULL) return false;

    long orig_offset = ftell(file);
    if (orig_offset < 0) LEXER_PANIC("cannot determine source file offset");
    int orig_lookahead = lookahead;
    unsigned long orig_line = LINE;
    unsigned long orig_col = COL;

    bool rv = lexer_pop(p_token);

    /* Without restoring LINE/COL every peek would skew later diagnostics. */
    lookahead = orig_lookahead;
    LINE = orig_line;
    COL = orig_col;
    if (fseek(file, orig_offset, SEEK_SET) != 0)
        LEXER_PANIC("cannot rewind source file");

    return rv;
}

/*
 * Character classification helpers. They take int so that EOF can be passed
 * safely; EOF belongs to none of the classes.
 */

static inline bool is_whitespace(int c) {
    /* '\r' is included so that CRLF line endings lex cleanly. */
    return c == ' ' || c == '\t' || c == '\n'
        || c == '\r' || c == '\v' || c == '\f';
}

static inline bool is_lower_alpha(int c) {
    return 'a' <= c && c <= 'z';
}

static inline bool is_upper_alpha(int c) {
    return 'A' <= c && c <= 'Z';
}

static inline bool is_alpha(int c) {
    return is_lower_alpha(c) || is_upper_alpha(c);
}

static inline bool is_numeric(int c) {
    return '0' <= c && c <= '9';
}

static inline bool is_alphanumeric(int c) {
    return is_alpha(c) || is_numeric(c);
}

static inline bool is_hexadecimal(int c) {
    return is_numeric(c) || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F');
}

static inline bool is_ident_legal(int c) {
    return is_alphanumeric(c) || c == '_' || c == '$';
}

/*
 * Return the current lookahead character and read the next one from the
 * file. Line/column tracking lives here so that every consumed newline is
 * counted, including those inside block comments.
 */
static int consume_char(void) {
    int rv = lookahead;
    lookahead = fgetc(file);
    if (rv == '\n') {
        LINE++;
        COL = 1;
    } else {
        COL++;
    }
    return rv;
}

/* Duplicate a NUL-terminated buffer on the heap, aborting on failure. */
static char* dup_or_panic(const char* text) {
    char* copy = strdup(text);
    if (copy == NULL) LEXER_PANIC("out of memory");
    return copy;
}

/*
 * Lex an identifier whose first character `ic` has already been consumed.
 * The token's identifier payload is a heap copy of the text.
 */
static void lex_ident(struct token* p_token, char ic) {
    char buf[1024] = {ic};
    size_t len = 1;

    while (is_ident_legal(lookahead)) {
        int c = consume_char();
        /* Keep one byte free for the terminating NUL. */
        if (len >= sizeof(buf) - 1)
            LEXER_PANIC(
                "identifier exceeds maximum size (%zu)", sizeof(buf) - 1);
        buf[len++] = (char) c;
    }
    buf[len] = '\0';

    *p_token = (struct token) {
        .type = IDENTIFIER,
        .data.identifier = dup_or_panic(buf),
    };
}

/*
 * Numeric value of digit character `c` in `base`. Callers only pass
 * characters accepted by is_hexadecimal(). Aborts if the digit is not valid
 * in the given base.
 */
static unsigned char digit_val(int c, unsigned char base) {
    unsigned char c_val;
    if (is_numeric(c))
        c_val = c - '0';
    else if (is_lower_alpha(c))
        c_val = c - 'a' + 10;
    else
        c_val = c - 'A' + 10;

    if (c_val >= base)
        LEXER_PANIC("invalid digit in base %hhu: %c", base, c);
    return c_val;
}

/*
 * Lex the fractional part of a floating literal. The decimal point has
 * already been consumed by the caller; `iv` is the value of the integer
 * part and `base` the radix of the digits that follow.
 */
static void lex_float_lit(
    struct token* p_token,
    unsigned char base,
    float_lit_t iv
) {
    float_lit_t exp = 1.0;  /* weight of the digit about to be read */
    while (is_hexadecimal(lookahead)) {
        int c = consume_char();
        /* Checked before digit_val(), which would reject 'e' as a digit. */
        if (base < 15 && (c == 'e' || c == 'E'))
            LEXER_PANIC("exponentiation is not implemented");
        exp /= base;
        iv += digit_val(c, base) * exp;
    }

    *p_token = (struct token) {
        .type = FLOAT_LIT,
        .data.float_lit = iv,
    };
}

/*
 * Lex an integer literal whose first digit has already been consumed; `iv`
 * is that digit's value. A leading 0 selects hexadecimal (0x), binary (0b)
 * or octal (0 followed by a digit). If a '.' follows the digits the literal
 * is handed off to lex_float_lit().
 */
static void lex_int_lit(struct token* p_token, int_lit_t iv) {
    unsigned char base = 10;

    if (iv == 0) {
        if (lookahead == 'x' || lookahead == 'X'
                || lookahead == 'b' || lookahead == 'B') {
            base = (lookahead == 'x' || lookahead == 'X') ? 16 : 2;
            int suffix = consume_char();
            if (!is_hexadecimal(lookahead))
                LEXER_PANIC("invalid suffix on integer constant: %c", suffix);
        } else if (is_hexadecimal(lookahead)) {
            base = 8;
        }
    }

    while (is_hexadecimal(lookahead)) {
        int c = consume_char();
        /*
         * Must precede digit_val(): for bases below 15 'e' is not a digit,
         * so digit_val() would abort with a less helpful message first.
         */
        if (base < 15 && (c == 'e' || c == 'E'))
            LEXER_PANIC("exponentiation is not implemented");
        unsigned char c_val = digit_val(c, base);
        if (ckd_mul(&iv, iv, base))
            LEXER_PANIC("integer literal will overflow");
        if (ckd_add(&iv, iv, c_val))
            LEXER_PANIC("integer literal will overflow");
    }

    if (lookahead == '.') {
        /*
         * NOTE(review): a literal such as 01.5 arrives here with base 8, so
         * both its integer and fractional digits are read as octal, whereas
         * C treats such literals as decimal -- confirm the intended
         * behaviour.
         */
        consume_char();  /* the decimal point */
        lex_float_lit(p_token, base, (float_lit_t) iv);
        return;
    }

    *p_token = (struct token) {
        .type = INT_LIT,
        .data.int_lit = iv,
    };
}

/*
 * Map the character following a backslash to the character it denotes.
 * Aborts for escape sequences that are not implemented.
 */
static char replace_escape_sequence(char c) {
    switch (c) {
        case '\'': return '\'';
        case '\"': return '\"';
        case '\\': return '\\';
        case '?': return '?';
        case 'r': return '\r';
        case 'n': return '\n';
        case 't': return '\t';
        case 'v': return '\v';
        case 'a': return '\a';
        case 'b': return '\b';
        case 'f': return '\f';
        /* TODO: numeric escape sequences, e.g. \xff */
        default: LEXER_PANIC("escape sequence not implemented");
    }
}

/*
 * Lex a character literal; the opening quote has already been consumed.
 * Supports one plain character or one escape sequence.
 */
static void lex_char_lit(struct token* p_token) {
    int c = consume_char();
    if (c == EOF) LEXER_PANIC("unexpected EOF in char literal");
    /* An unescaped quote right after the opening quote means '' (empty). */
    if (c == '\'') LEXER_PANIC("empty char literal");

    if (c == '\\') {
        c = consume_char();
        if (c == EOF) LEXER_PANIC("unexpected EOF in char literal");
        c = replace_escape_sequence(c);
    }

    int close_quote = consume_char();
    if (close_quote == EOF) LEXER_PANIC("unexpected EOF in char literal");
    if (close_quote != '\'')
        LEXER_PANIC(
            "expected end of char literal, not \"%c\"", close_quote);

    *p_token = (struct token) {
        .type = CHAR_LIT,
        .data.char_lit = c,
    };
}

/*
 * Lex a string literal; the opening quote has already been consumed.
 * The token's str_lit payload is a heap copy of the decoded text.
 */
static void lex_str_lit(struct token* p_token) {
    char buf[65536];
    size_t len = 0;

    for (;;) {
        /*
         * Test the lookahead rather than the consumed character so the
         * error is reported on the line where the string actually is.
         */
        if (lookahead == EOF || lookahead == '\n')
            LEXER_PANIC("unterminated string literal");

        int c = consume_char();
        if (c == '"') break;

        if (c == '\\') {
            if (lookahead == EOF) LEXER_PANIC("unterminated string literal");
            /* Translate before consuming so errors point at the escape. */
            c = replace_escape_sequence((char) lookahead);
            consume_char();
        }

        /* Keep one byte free for the terminating NUL. */
        if (len >= sizeof(buf) - 1)
            LEXER_PANIC(
                "string literal exceeds maximum length (%zu)",
                sizeof(buf) - 1);
        buf[len++] = (char) c;
    }
    buf[len] = '\0';

    *p_token = (struct token) {
        .type = STR_LIT,
        .data.str_lit = dup_or_panic(buf),
    };
}

/*
 * Classify the two-character operator formed by the already-consumed `c`
 * and the current lookahead. Returns NOT_FOUND if the pair is not an
 * operator. Does not consume anything.
 */
static enum token_type two_char_operator_type(char c) {
    if (c == '!' && lookahead == '=') return NEQ;
    if (c == '^' && lookahead == '=') return XEQ;
    if (c == '&' && lookahead == '=') return AND_EQ;
    if (c == '&' && lookahead == '&') return LOG_AND;
    if (c == '*' && lookahead == '=') return MUL_EQ;
    if (c == '-' && lookahead == '=') return NEG_EQ;
    if (c == '-' && lookahead == '>') return ARROW;
    if (c == '=' && lookahead == '=') return TEST_EQ;
    if (c == '+' && lookahead == '=') return PLUS_EQ;
    if (c == '|' && lookahead == '|') return LOG_PIPE;
    if (c == '|' && lookahead == '=') return PIPE_EQ;
    if (c == '/' && lookahead == '=') return DIV_EQ;
    if (c == '%' && lookahead == '=') return MOD_EQ;
    if (c == '<' && lookahead == '=') return LEQ;
    if (c == '>' && lookahead == '=') return GEQ;
    if (c == '<' && lookahead == '<') return SHL;
    if (c == '>' && lookahead == '>') return SHR;
    return NOT_FOUND;
}

/*
 * Try to lex a two- or three-character operator starting with the
 * already-consumed `c`. On success stores the token and returns true; on
 * failure consumes nothing and returns false.
 */
static bool lex_complex_operator(struct token* p_token, char c) {
    enum token_type type = two_char_operator_type(c);
    if (type == NOT_FOUND) return false;

    consume_char();  /* second character of the operator */

    /* "<<" and ">>" may extend to the three-character "<<=" and ">>=". */
    if (type == SHL && lookahead == '=') {
        consume_char();
        type = SHL_EQ;
    } else if (type == SHR && lookahead == '=') {
        consume_char();
        type = SHR_EQ;
    }

    *p_token = (struct token) {.type = type};
    /*
     * Report success explicitly: returning `type` would read as false for
     * an enumerator whose value is 0, after input was already consumed.
     */
    return true;
}

/*
 * Token type of the single-character operator `c`. Aborts if `c` does not
 * start any known token.
 */
static enum token_type lex_simple_operator(char c) {
    switch (c) {
        case '#': return HASHTAG;
        case '(': return LPAREN;
        case ')': return RPAREN;
        case '{': return LCURLY;
        case '}': return RCURLY;
        case '[': return LSQUARE;
        case ']': return RSQUARE;
        case ':': return COLON;
        case ';': return SEMI;
        case ',': return COMMA;
        case '.': return DOT;
        case '?': return QMARK;
        case '!': return NOT;
        case '^': return XOR;
        case '&': return AMP;
        case '*': return STAR;
        case '-': return NEG;
        case '=': return ASSIGN;
        case '+': return PLUS;
        case '\\': return BSLASH;
        case '|': return PIPE;
        case '/': return DIV;
        case '%': return MOD;
        case '<': return LT;
        case '>': return GT;
    }
    LEXER_PANIC("unexpected token %c", c);
}

/*
 * Skip a block comment. The leading '/' has been consumed and the lookahead
 * is the opening '*'. Aborts if the file ends before the comment closes.
 */
static void skip_block_comment(void) {
    consume_char();  /* the opening '*' */

    int prev = consume_char();
    while (prev != EOF && (prev != '*' || lookahead != '/'))
        prev = consume_char();
    if (prev == EOF) LEXER_PANIC("unterminated /* comment");

    consume_char();  /* the closing '/' */
}

/*
 * Read the next token into *p_token, skipping any whitespace and comments
 * in front of it.
 *
 * Returns false if no file is loaded or the end of file is reached before
 * a token starts; true otherwise. Malformed input aborts the process via
 * LEXER_PANIC. Identifier and string tokens carry heap-allocated text
 * (presumably owned by the caller -- confirm against lexer.h).
 */
bool lexer_pop(struct token* p_token) {
    if (file == NULL) return false;

    /* Consume all whitespace and comments preceding the next token. */
    int c;
    for (;;) {
        c = consume_char();
        if (c == EOF) return false;

        if (c == '/' && lookahead == '/') {
            /* Line comment: stop before the newline, handled as whitespace. */
            while (lookahead != EOF && lookahead != '\n') consume_char();
        } else if (c == '/' && lookahead == '*') {
            skip_block_comment();
        } else if (!is_whitespace(c)) {
            break;
        }
    }

    if (is_numeric(c))
        lex_int_lit(p_token, c - '0');
    else if (c == '.' && is_numeric(lookahead))
        lex_float_lit(p_token, 10, 0);  /* the '.' is already consumed */
    else if (is_ident_legal(c))
        lex_ident(p_token, (char) c);
    else if (c == '\'')
        lex_char_lit(p_token);
    else if (c == '"')
        lex_str_lit(p_token);
    else if (!lex_complex_operator(p_token, (char) c))
        p_token->type = lex_simple_operator((char) c);

    return true;
}