#include "ccc.h"
#include "lexer.h"

/* NOTE(review): the names of the four system headers were missing from this
 * file as received. The list below is reconstructed from what the code uses
 * (ckd_add/ckd_mul, FILE/fprintf/fgetc, exit, strndup) -- confirm against the
 * project's original include list. */
#include <stdckdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Lexer state. A single input file is open at a time. */
static FILE* file = NULL;           /* current input, NULL when closed */
static int lookahead;               /* next unread character, or EOF */
static const char* PATH;            /* path passed to lexer_load (not copied) */
static unsigned long LINE, COL;     /* 1-based position of `lookahead` */
static struct token tok = {.type = TK_NOT_FOUND}; /* buffered next token */

/* Print a formatted error prefixed with the current PATH/LINE/COL to stderr
 * and terminate the process with status 1. Wrapped in do/while(0) so that it
 * behaves as a single statement, including in front of an `else`. */
#define LEXER_PANIC(format, ...) \
    do {\
        fprintf(\
            stderr,\
            "ccc: lexer error: %s: line %lu, column %lu: " format "\n",\
            PATH,\
            LINE,\
            COL __VA_OPT__(,)\
            __VA_ARGS__);\
        exit(1);\
    } while (0)

static void lexer_advance(void);

/* Open `path` for lexing, closing any previously loaded file first, and
 * buffer the first token so that lexer_peek/lexer_pop can be used right away.
 * `path` is stored by pointer for use in tokens and error messages, so it
 * must stay valid for as long as the lexer (and its tokens) are in use. */
void lexer_load(const char* path) {
    if (file != NULL) {
        fclose(file);
    }
    file = fopen(path, "r");
    if (file == NULL) CCC_PANIC;
    lookahead = fgetc(file);
    PATH = path;
    LINE = 1;
    COL = 1;
    lexer_advance();
}

/* Close the current input file, if any. Safe to call repeatedly. */
void lexer_close(void) {
    if (file == NULL)
        return;
    fclose(file);
    file = NULL;
}

/* Copy the buffered token into *p_token (if p_token is not NULL) without
 * consuming it. Returns false when no token is available. */
bool lexer_peek(struct token* p_token) {
    if (tok.type == TK_NOT_FOUND)
        return false;
    if (p_token != NULL)
        *p_token = tok;
    return true;
}

/* Character classification helpers. They take int so that EOF can be passed
 * safely; EOF is never a member of any class. */
static inline bool is_whitespace(int c) {
    /* '\r', '\v' and '\f' are accepted so that CRLF input does not trip the
     * "unexpected token" panic. */
    return c == ' ' || c == '\t' || c == '\n'
        || c == '\r' || c == '\v' || c == '\f';
}

static inline bool is_lower_alpha(int c) {
    return 'a' <= c && c <= 'z';
}

static inline bool is_upper_alpha(int c) {
    return 'A' <= c && c <= 'Z';
}

static inline bool is_alpha(int c) {
    return is_lower_alpha(c) || is_upper_alpha(c);
}

static inline bool is_numeric(int c) {
    return '0' <= c && c <= '9';
}

static inline bool is_alphanumeric(int c) {
    return is_alpha(c) || is_numeric(c);
}

static inline bool is_hexadecimal(int c) {
    return is_numeric(c) || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F');
}

static inline bool is_ident_legal(int c) {
    return is_alphanumeric(c) || c == '_' || c == '$';
}

/* Return the current lookahead character and read the next one from the
 * file. This is the only place that updates LINE and COL, so newlines are
 * counted wherever they are consumed, including inside block comments. */
static int consume_char(void) {
    int rv = lookahead;
    lookahead = fgetc(file);
    if (rv == '\n') {
        LINE++;
        COL = 1;
    } else {
        COL++;
    }
    return rv;
}

/* Heap-copy at most `len` bytes of `buf` as a NUL-terminated string,
 * panicking instead of returning NULL when allocation fails. */
static char* dup_lexeme(const char* buf, size_t len) {
    char* copy = strndup(buf, len);
    if (copy == NULL)
        LEXER_PANIC("out of memory");
    return copy;
}

/* Lex an identifier whose first character `ic` has already been consumed.
 * The identifier text is heap-allocated; nothing in this file frees it. */
static void lex_ident(struct token* p_token, char ic) {
    char buf[1024] = {ic};
    size_t len = 1;

    while (is_ident_legal(lookahead)) {
        int c = consume_char();
        if (len >= sizeof(buf) - 1)
            LEXER_PANIC(
                "identifier exceeds maximum size (%zu)", sizeof(buf) - 1);
        buf[len++] = (char) c;
    }
    buf[len] = 0;

    *p_token = (struct token) {
        .type = TK_IDENT,
        .data.ident = dup_lexeme(buf, len),
        .PATH = PATH,
        .LINE = LINE,
        .COL = COL,
    };
}

/* Convert digit character `c` (callers only pass characters accepted by
 * is_hexadecimal) to its numeric value, panicking if it is not a valid digit
 * in `base`. */
static unsigned char digit_val(int c, unsigned char base) {
    unsigned char c_val;
    if (is_numeric(c))
        c_val = c - '0';
    else if (is_lower_alpha(c))
        c_val = c - 'a' + 10;
    else
        c_val = c - 'A' + 10;

    if (c_val >= base)
        LEXER_PANIC("invalid digit in base %hhu: %c", base, c);
    return c_val;
}

/* Lex the fractional part of a floating-point literal.
 *
 * The decimal point must already have been consumed by the caller; `iv` is
 * the value of the integer part (0 for literals such as ".5") and `base` is
 * the radix of the digits. Exponents are not supported. */
static void lex_float_lit(
    struct token* p_token,
    unsigned char base,
    float_lit_t iv
) {
    float_lit_t scale = 1.0;

    while (is_hexadecimal(lookahead)) {
        int c = consume_char();
        /* Checked before digit_val, which would otherwise reject 'e' as an
         * invalid digit with a less helpful message. */
        if (base < 15 && (c == 'e' || c == 'E'))
            LEXER_PANIC("exponentiation is not implemented");
        scale /= base;
        iv += digit_val(c, base) * scale;
    }

    *p_token = (struct token) {
        .type = TK_FLOAT_LIT,
        .data.float_lit = iv,
        .PATH = PATH,
        .LINE = LINE,
        .COL = COL,
    };
}

/* Lex an integer literal whose first digit has already been consumed; `iv`
 * is that digit's value.
 *
 * A leading 0 selects the radix: 0x/0X is hexadecimal, 0b/0B is binary, and
 * 0 followed by any other digit is octal. If a '.' follows the digits the
 * literal is handed over to lex_float_lit. */
static void lex_int_lit(struct token* p_token, int_lit_t iv) {
    unsigned char base = 10;

    if (iv == 0) {
        if (lookahead == 'x' || lookahead == 'X'
            || lookahead == 'b' || lookahead == 'B') {
            base = (lookahead == 'x' || lookahead == 'X') ? 16 : 2;
            int suffix = consume_char();
            if (!is_hexadecimal(lookahead))
                LEXER_PANIC("invalid suffix on integer constant: %c", suffix);
        } else if (is_hexadecimal(lookahead)) {
            base = 8;
        }
    }

    while (is_hexadecimal(lookahead)) {
        int c = consume_char();
        /* Must come before digit_val: in bases below 15 digit_val would panic
         * on 'e' first and this diagnostic could never be reached. */
        if (base < 15 && (c == 'e' || c == 'E'))
            LEXER_PANIC("exponentiation is not implemented");
        unsigned char c_val = digit_val(c, base);
        if (ckd_mul(&iv, iv, base))
            LEXER_PANIC("integer literal will overflow");
        if (ckd_add(&iv, iv, c_val))
            LEXER_PANIC("integer literal will overflow");
    }

    if (lookahead == '.') {
        consume_char(); /* the decimal point */
        lex_float_lit(p_token, base, (float_lit_t) iv);
        return;
    }

    *p_token = (struct token) {
        .type = TK_INT_LIT,
        .data.int_lit = iv,
        .PATH = PATH,
        .LINE = LINE,
        .COL = COL,
    };
}

/* Map the character following a backslash to the character it denotes.
 * Panics on escape sequences that are not implemented. */
static char replace_escape_sequence(char c) {
    switch (c) {
    case '\'': return '\'';
    case '\"': return '\"';
    case '\\': return '\\';
    case '?': return '?';
    case 'r': return '\r';
    case 'n': return '\n';
    case 't': return '\t';
    case 'v': return '\v';
    case 'a': return '\a';
    case 'b': return '\b';
    case 'f': return '\f';
    /* TODO: numeric escape sequences, e.g. \xff */
    default:
        LEXER_PANIC("escape sequence not implemented");
    }
}

/* Lex a character literal; the opening quote has already been consumed. */
static void lex_char_lit(struct token* p_token) {
    int c = consume_char();
    if (c == EOF)
        LEXER_PANIC("unexpected EOF in char literal");

    if (c == '\\') {
        c = consume_char();
        if (c == EOF)
            LEXER_PANIC("unexpected EOF in char literal");
        c = replace_escape_sequence(c);
    }

    int close_quote = consume_char();
    if (close_quote == EOF)
        LEXER_PANIC("unexpected EOF in char literal");
    if (close_quote != '\'')
        LEXER_PANIC(
            "expected end of char literal, not \"%c\"", close_quote);

    *p_token = (struct token) {
        .type = TK_CHAR_LIT,
        .data.char_lit = c,
        .PATH = PATH,
        .LINE = LINE,
        .COL = COL,
    };
}

/* Lex a string literal; the opening quote has already been consumed.
 * The decoded text is heap-allocated; nothing in this file frees it. */
static void lex_str_lit(struct token* p_token) {
    char buf[65536];
    size_t len = 0;

    for (;;) {
        /* Peek before consuming so the error is reported at the position of
         * the unterminated line rather than the start of the next one. */
        if (lookahead == EOF || lookahead == '\n')
            LEXER_PANIC("unterminated string literal");

        int c = consume_char();
        if (c == '"')
            break;

        if (c == '\\') {
            if (lookahead == EOF)
                LEXER_PANIC("unterminated string literal");
            c = replace_escape_sequence(consume_char());
        }

        if (len >= sizeof(buf) - 1)
            LEXER_PANIC(
                "string literal exceeds maximum length (%zu)",
                sizeof(buf) - 1);
        buf[len++] = (char) c;
    }
    buf[len] = 0;

    *p_token = (struct token) {
        .type = TK_STR_LIT,
        .data.str_lit = dup_lexeme(buf, len),
        .PATH = PATH,
        .LINE = LINE,
        .COL = COL,
    };
}

/* Return the two-character operator formed by the already-consumed `c` and
 * the current lookahead, or TK_NOT_FOUND if they do not form one. Does not
 * consume anything. */
static enum token_type two_char_operator_type(char c) {
    if (c == '!' && lookahead == '=') return TK_NEQ;
    if (c == '^' && lookahead == '=') return TK_XEQ;
    if (c == '&' && lookahead == '=') return TK_AND_EQ;
    if (c == '&' && lookahead == '&') return TK_LOG_AND;
    if (c == '*' && lookahead == '=') return TK_MUL_EQ;
    if (c == '-' && lookahead == '=') return TK_NEG_EQ;
    if (c == '-' && lookahead == '>') return TK_ARROW;
    if (c == '=' && lookahead == '=') return TK_TEST_EQ;
    if (c == '+' && lookahead == '=') return TK_PLUS_EQ;
    if (c == '|' && lookahead == '|') return TK_LOG_PIPE;
    if (c == '|' && lookahead == '=') return TK_PIPE_EQ;
    if (c == '/' && lookahead == '=') return TK_DIV_EQ;
    if (c == '%' && lookahead == '=') return TK_MOD_EQ;
    if (c == '<' && lookahead == '=') return TK_LEQ;
    if (c == '>' && lookahead == '=') return TK_GEQ;
    if (c == '<' && lookahead == '<') return TK_SHL;
    if (c == '>' && lookahead == '>') return TK_SHR;
    return TK_NOT_FOUND;
}

/* Try to lex a two- or three-character operator starting with the
 * already-consumed `c`. On success the remaining characters are consumed,
 * *p_token is filled in and true is returned; otherwise nothing is consumed
 * and false is returned. */
static bool lex_complex_operator(struct token* p_token, char c) {
    enum token_type type = two_char_operator_type(c);
    if (type == TK_NOT_FOUND)
        return false;
    consume_char();

    /* "<<" and ">>" may be extended to "<<=" and ">>=". */
    if (type == TK_SHL && lookahead == '=') {
        consume_char();
        type = TK_SHL_EQ;
    }
    if (type == TK_SHR && lookahead == '=') {
        consume_char();
        type = TK_SHR_EQ;
    }

    *p_token = (struct token) {
        .type = type,
        .PATH = PATH,
        .LINE = LINE,
        .COL = COL,
    };
    return true;
}

/* Map a single-character operator or punctuator to its token type, panicking
 * on any character that is not one. */
static enum token_type lex_simple_operator(char c) {
    switch (c) {
    case '#': return TK_HASHTAG;
    case '(': return TK_LPAREN;
    case ')': return TK_RPAREN;
    case '{': return TK_LCURLY;
    case '}': return TK_RCURLY;
    case '[': return TK_LSQUARE;
    case ']': return TK_RSQUARE;
    case ':': return TK_COLON;
    case ';': return TK_SEMI;
    case ',': return TK_COMMA;
    case '.': return TK_DOT;
    case '?': return TK_QMARK;
    case '!': return TK_NOT;
    case '^': return TK_XOR;
    case '&': return TK_AMP;
    case '*': return TK_STAR;
    case '-': return TK_NEG;
    case '=': return TK_ASSIGN;
    case '+': return TK_PLUS;
    case '\\': return TK_BSLASH;
    case '|': return TK_PIPE;
    case '/': return TK_DIV;
    case '%': return TK_MOD;
    case '<': return TK_LT;
    case '>': return TK_GT;
    default: break;
    }
    LEXER_PANIC("unexpected token %c", c);
}

/* Read the next token from the file into `tok`. Returns false when no file
 * is loaded or the end of input is reached before another token starts.
 *
 * Every token's LINE/COL are captured after its characters have been
 * consumed, i.e. they describe the position just past the token. */
static bool lexer_read(void) {
    if (file == NULL)
        return false;

    // consume all whitespace and comments preceding the next token
    int c;
    for (;;) {
        c = consume_char();
        if (c == EOF)
            return false;

        if (c == '/' && lookahead == '/') {
            // line comment: skip up to, but not including, the newline
            while (lookahead != EOF && lookahead != '\n')
                consume_char();
        } else if (c == '/' && lookahead == '*') {
            consume_char(); // consume the opening star
            int cc = consume_char();
            while (cc != EOF && (cc != '*' || lookahead != '/'))
                cc = consume_char();
            if (cc == EOF)
                LEXER_PANIC("unterminated /* comment");
            consume_char(); // consume the final slash
        } else if (!is_whitespace(c)) {
            break;
        }
    }

    if (is_numeric(c)) {
        lex_int_lit(&tok, c - '0');
    } else if (c == '.' && is_numeric(lookahead)) {
        /* The '.' is already consumed, which is what lex_float_lit expects. */
        lex_float_lit(&tok, 10, 0);
    } else if (is_ident_legal(c)) {
        lex_ident(&tok, c);
    } else if (c == '\'') {
        lex_char_lit(&tok);
    } else if (c == '"') {
        lex_str_lit(&tok);
    } else if (!lex_complex_operator(&tok, c)) {
        /* Build a complete token so that position and payload from the
         * previous token do not leak into this one. */
        enum token_type type = lex_simple_operator(c);
        tok = (struct token) {
            .type = type,
            .PATH = PATH,
            .LINE = LINE,
            .COL = COL,
        };
    }
    return true;
}

/* Replace the buffered token with the next one from the input, or mark it
 * as TK_NOT_FOUND when the input is exhausted. */
static void lexer_advance(void) {
    if (!lexer_read())
        tok.type = TK_NOT_FOUND;
}

/* Copy the buffered token into *p_token (if p_token is not NULL) and advance
 * to the next one. Returns false when no token is available. */
bool lexer_pop(struct token* p_token) {
    if (tok.type == TK_NOT_FOUND)
        return false;
    if (p_token != NULL)
        *p_token = tok;
    lexer_advance();
    return true;
}