#include "ccc.h"
#include "lexer.h"

#include <stdio.h>
#include <string.h>

/* ckd_add()/ckd_mul() are the C23 checked-arithmetic macros.  Pull in the
 * standard header unless something included above already provides them. */
#if !defined(ckd_mul) && defined(__has_include)
#if __has_include(<stdckdint.h>)
#include <stdckdint.h>
#endif
#endif

/*
 * Lexer state.
 *
 * NOTE(review): CCC_ERROR and CCC_PANIC come from a project header that is not
 * visible here; the code below is written on the assumption that neither of
 * them returns -- confirm against ccc.h.
 */

/* Source file currently being lexed; NULL until lexer_load() succeeds. */
static FILE* file = NULL;
/* One character of lookahead: the value the next consume_char() returns. */
static int lookahead;
/* Line counter: set to 1 by lexer_load(), bumped for every newline skipped. */
static long LINE;

/*
 * Open `path` for lexing, closing any previously loaded file first, and prime
 * the one-character lookahead.  Invokes CCC_PANIC if the file cannot be
 * opened.
 */
void lexer_load(const char* path) {
    if (file != NULL) {
        fclose(file);
    }
    file = fopen(path, "r");
    if (file == NULL) {
        CCC_PANIC;
    }
    lookahead = fgetc(file);
    LINE = 1;
}

/*
 * Read the next token into *p_token without advancing the lexer.
 *
 * Implemented as "pop, then rewind": the stream offset, the lookahead
 * character and the line counter are all saved before calling lexer_pop() and
 * restored afterwards, so a following lexer_pop() yields the same token and
 * LINE is not advanced twice for the same newline.
 *
 * Returns false when no file is loaded or the input is exhausted.
 *
 * Identifier tokens carry a string allocated with strndup(), so a peek
 * followed by a pop produces two separate allocations.
 */
bool lexer_peek(struct token* p_token) {
    if (file == NULL) {
        return false;
    }

    long saved_offset = ftell(file);
    if (saved_offset < 0) {
        /* Non-seekable stream: we could not rewind after the pop. */
        CCC_PANIC;
    }
    int saved_lookahead = lookahead;
    long saved_line = LINE;

    bool rv = lexer_pop(p_token);

    lookahead = saved_lookahead;
    LINE = saved_line;
    if (fseek(file, saved_offset, SEEK_SET) != 0) {
        CCC_PANIC;
    }
    return rv;
}

/*
 * Character classification helpers.  These are macros and evaluate their
 * argument more than once: only pass side-effect-free expressions.
 */
#define is_whitespace(c)                                                      \
    ((c) == ' ' || (c) == '\t' || (c) == '\n' || (c) == '\r' || (c) == '\v' || \
     (c) == '\f')
#define is_lower_alpha(c) ('a' <= (c) && (c) <= 'z')
#define is_upper_alpha(c) ('A' <= (c) && (c) <= 'Z')
#define is_alpha(c) (is_lower_alpha(c) || is_upper_alpha(c))
#define is_numeric(c) ('0' <= (c) && (c) <= '9')
#define is_alphanumeric(c) (is_alpha(c) || is_numeric(c))
#define is_ident_legal(c) (is_alphanumeric(c) || (c) == '_' || (c) == '$')

/* Return the current lookahead character and fetch the next one from file. */
static int consume_char(void) {
    int rv = lookahead;
    lookahead = fgetc(file);
    return rv;
}

/*
 * Lex an identifier whose first character `ic` has already been consumed.
 * The identifier text is copied to the heap with strndup() and stored in
 * p_token->data.identifier.  Identifiers that do not fit in the local buffer
 * are reported through CCC_ERROR.
 */
static void lex_ident(struct token* p_token, char ic) {
    char buf[1024] = {ic};
    size_t len = 1;

    while (is_ident_legal(lookahead)) {
        /* Check before writing so there is always room for the terminator. */
        if (len >= sizeof(buf) - 1) {
            CCC_ERROR("identifier exceeds maximum size (%zu)", sizeof(buf));
        }
        buf[len++] = (char)consume_char();
    }
    buf[len] = '\0';

    char* name = strndup(buf, len);
    if (name == NULL) {
        CCC_PANIC;
    }

    *p_token = (struct token) {
        .type = IDENTIFIER,
        .data.identifier = name,
    };
}

/*
 * Placeholder for floating point literals: always reports an error.
 * `base` and `iv` are the radix and the integer part lexed so far.
 */
static void lex_float_lit(
    struct token* p_token, unsigned char base, double iv
) {
    (void)p_token;
    (void)base;
    (void)iv;
    CCC_ERROR("lexer: floating point literals are not supported yet");
}

/*
 * Lex an integer literal whose first digit has already been consumed; `iv` is
 * that digit's value.
 *
 * A leading 0 selects the radix: 0x/0X -> 16, 0b/0B -> 2, otherwise 8.
 * Everything else is decimal.  Digits outside the radix and values that
 * overflow intlit_t are reported through CCC_ERROR.  If the digits are
 * followed by '.', the literal is handed to lex_float_lit().
 */
static void lex_int_lit(struct token* p_token, intlit_t iv) {
    unsigned char base = 10;

    /* TODO: exponentiation, 2e10 f.e. */
    if (iv == 0) {
        /* Only a single digit has been read, so iv == 0 means it was '0'. */
        if (lookahead == 'x' || lookahead == 'X' ||
            lookahead == 'b' || lookahead == 'B') {
            base = (lookahead == 'x' || lookahead == 'X') ? 16 : 2;
            int suffix = consume_char();
            /* A radix prefix must be followed by at least one digit. */
            if (!is_alphanumeric(lookahead)) {
                CCC_ERROR(
                    "lexer: invalid suffix on integer constant: %c", suffix);
            }
        } else {
            base = 8;
        }
    }

    while (is_alphanumeric(lookahead)) {
        int c = consume_char();
        intlit_t c_val;

        if (is_numeric(c)) {
            c_val = c - '0';
        } else if (is_lower_alpha(c)) {
            c_val = c - 'a' + 10;
        } else {
            c_val = c - 'A' + 10;
        }

        if (c_val >= base) {
            CCC_ERROR("lexer: invalid digit in base %hhu: %c", base, c);
        }
        if (ckd_mul(&iv, iv, base)) {
            CCC_ERROR("lexer: integer literal will overflow");
        }
        if (ckd_add(&iv, iv, c_val)) {
            CCC_ERROR("lexer: integer literal will overflow");
        }
    }

    if (lookahead == '.') {
        consume_char();
        lex_float_lit(p_token, base, iv);
        return;
    }

    *p_token = (struct token) {
        .type = INT_LIT,
        .data.int_lit = iv,
    };
}

/*
 * Lex a character literal; the opening quote has already been consumed.
 * Only the escapes \' and \" are supported.  Empty literals, EOF inside the
 * literal and a missing closing quote are reported through CCC_ERROR.
 */
static void lex_char_lit(struct token* p_token) {
    int c = consume_char();
    if (c == EOF) {
        CCC_ERROR("lexer: unexpected EOF in char literal");
    }
    if (c == '\'') {
        /* '' -- without this check ''' would be accepted as the char '. */
        CCC_ERROR("lexer: empty char literal");
    }

    if (c == '\\') {
        c = consume_char();
        if (c == EOF) {
            CCC_ERROR("lexer: unexpected EOF in char literal");
        }
        /* \' and \" stand for themselves; nothing else is handled yet. */
        if (c != '\'' && c != '"') {
            CCC_ERROR(
                "lexer: escape sequences other than quotes are not supported yet");
        }
    }

    int close_quote = consume_char();
    if (close_quote == EOF) {
        CCC_ERROR("lexer: unexpected EOF in char literal");
    }
    if (close_quote != '\'') {
        CCC_ERROR("lexer: expected \"'\", not \"%c\"", close_quote);
    }

    *p_token = (struct token) {
        .type = CHAR_LIT,
        .data.char_lit = c,
    };
}

/*
 * Placeholder for string literals: always reports an error, so lexer_pop()
 * never hands back a token it did not fill in.
 */
static void lex_str_lit(struct token* p_token) {
    (void)p_token;
    CCC_ERROR("lexer: string literals are not supported yet");
}

/*
 * Map a single punctuation character to its token type.  Characters that are
 * not in the table are reported through CCC_ERROR.
 */
enum token_type lex_simple(char c) {
    switch (c) {
    case '*': return STAR; /* TODO: *= */
    case '#': return HASHTAG;
    case '(': return LPAREN;
    case ')': return RPAREN;
    case '{': return LCURLY;
    case '}': return RCURLY;
    case '[': return LSQUARE;
    case ']': return RSQUARE;
    case ':': return COLON;
    case ';': return SEMI;
    case ',': return COMMA;
    case '.': return DOT;
    case '?': return QMARK;
    default: break;
    }
    CCC_ERROR("lexer: unexpected token %c", c);
}

/*
 * Skip the remainder of a block comment.  On entry the opening '/' has been
 * consumed and the lookahead is the '*'.  Newlines inside the comment advance
 * LINE; hitting EOF before the closing star-slash is reported through
 * CCC_ERROR.
 */
static void skip_block_comment(void) {
    consume_char(); /* the '*' that opened the comment */

    for (;;) {
        int c = consume_char();
        if (c == EOF) {
            CCC_ERROR("unterminated /* comment");
            return;
        }
        if (c == '\n') {
            LINE++;
        } else if (c == '*' && lookahead == '/') {
            break;
        }
    }

    consume_char(); /* the closing '/' */
}

/*
 * Read the next token into *p_token and advance past it.
 *
 * Whitespace, line comments and block comments in front of the token are
 * skipped.  Returns false, leaving *p_token untouched, when no file is loaded
 * or the end of the input is reached; returns true otherwise.
 */
bool lexer_pop(struct token* p_token) {
    /* TODO: e.g. float f = .25; */
    if (file == NULL) {
        return false;
    }

    /* Consume all whitespace and comments preceding the next token. */
    int c;
    for (;;) {
        c = consume_char();
        if (c == EOF) {
            return false;
        }
        if (c == '/' && lookahead == '/') {
            /* Stop in front of the newline so the LINE++ branch sees it. */
            while (lookahead != EOF && lookahead != '\n') {
                consume_char();
            }
        } else if (c == '/' && lookahead == '*') {
            skip_block_comment();
        } else if (c == '\n') {
            LINE++;
        } else if (!is_whitespace(c)) {
            break;
        }
    }

    /* c is the first character of the token: dispatch on it. */
    if (is_numeric(c)) {
        lex_int_lit(p_token, c - '0');
    } else if (c == '.' && is_numeric(lookahead)) {
        lex_float_lit(p_token, 10, 0);
    } else if (is_ident_legal(c)) {
        lex_ident(p_token, (char)c);
    } else if (c == '\'') {
        lex_char_lit(p_token);
    } else if (c == '"') {
        lex_str_lit(p_token);
    } else {
        *p_token = (struct token) {.type = lex_simple((char)c)};
    }

    return true;
}