functional lexer

author: Carson Fleming <cflems@cflems.net> 2026-03-15 19:25:26 -0400
committer: Carson Fleming <cflems@cflems.net> 2026-03-15 19:25:26 -0400
commit: 66e278b0752eaa9808687c0dc214b49d7b58e8eb (patch)
tree: cca09f74ab439c3d54913bc1b4355b8208a5824f /lexer.c
parent: 1f85b418dd7960c28f16de21c44dcb4e2e05e694 (diff)
download: ccc-66e278b0752eaa9808687c0dc214b49d7b58e8eb.tar.gz
1 files changed, 138 insertions, 38 deletions
diff --git a/lexer.c b/lexer.c
index 7e2f5a4..a4ffd89 100644
--- a/lexer.c
+++ b/lexer.c
@@ -1,11 +1,23 @@
 #include "ccc.h"
 #include "lexer.h"
+#include <stdlib.h>
+#include <stdio.h>
 #include <string.h>
 #include <stdckdint.h>
 
+/* Reports a formatted lexer error at the current LINE/COL on stderr and
+ * exits with status 1. Wrapped in do/while so it acts as one statement. */
+#define LEXER_PANIC(format, ...) do {\
+    fprintf(\
+        stderr,\
+        "ccc: lexer error: line %lu, column %lu: " format "\n",\
+        LINE, COL __VA_OPT__(,) __VA_ARGS__);\
+    exit(1);\
+} while (0)
+
 static FILE* file = NULL;
 static int lookahead;
-static long LINE;
+static unsigned long LINE, COL;
 
 void lexer_load(const char* path) {
     if (file != NULL) {
@@ -16,6 +28,13 @@ void lexer_load(const char* path) {
 
     lookahead = fgetc(file);
     LINE = 1;
+    COL = 1;
+}
+
+void lexer_close() { /* closes the loaded source file, if any */
+    if (file != NULL) /* nothing to release when no file is loaded */
+        fclose(file);
+    file = NULL;
 }
 
 bool lexer_peek(struct token* p_token) {
@@ -40,6 +59,7 @@ bool lexer_peek(struct token* p_token) {
 static int consume_char() {
     int rv = lookahead;
     lookahead = fgetc(file);
+    COL++; /* each consumed character advances the column; lexer_pop resets it after '\n' */
     return rv;
 }
 
@@ -50,7 +70,8 @@ static void lex_ident(struct token* p_token, char ic) {
     while (is_ident_legal(lookahead)) {
         int c = consume_char();
         if (len >= sizeof(buf) - 1)
-            CCC_ERROR("identifier exceeds maximum size (%ld)", sizeof(buf));
+            LEXER_PANIC(
+                "identifier exceeds maximum size (%ld)", sizeof(buf) - 1);
         buf[len++] = c;
     }
 
@@ -66,10 +87,10 @@ static void lex_float_lit(
     unsigned char base,
     double iv
 ) {
-    CCC_ERROR("lexer: floating point literals are not supported yet");
+    LEXER_PANIC("floating point literals are not implemented");
 }
 
-static void lex_int_lit(struct token* p_token, intlit_t iv) {
+static void lex_int_lit(struct token* p_token, int_lit_t iv) {
     unsigned char base = 10;
 
     /* TODO: exponentiation, 2e10 f.e. */
@@ -79,31 +100,25 @@ static void lex_int_lit(struct token* p_token, intlit_t iv) {
             base = (lookahead == 'x' || lookahead == 'X') ? 16 : 2;
             int suffix = consume_char();
             if (!is_alphanumeric(lookahead))
-                CCC_ERROR(
-                    "lexer: invalid suffix on integer constant: %c", suffix);
+                LEXER_PANIC("invalid suffix on integer constant: %c", suffix);
         } else base = 8;
     }
 
     while (is_alphanumeric(lookahead)) {
         int c = consume_char();
-        intlit_t c_val;
+        int_lit_t c_val;
 
         if (is_numeric(c)) c_val = c - '0';
         else if (is_lower_alpha(c)) c_val = c - 'a' + 10;
         else c_val = c - 'A' + 10;
 
         if (c_val >= base)
-            CCC_ERROR(
-                "lexer: invalid digit in base %hhu: %c",
-                base,
-                c);
+            LEXER_PANIC("invalid digit in base %hhu: %c", base, c);
 
         if (ckd_mul(&iv, iv, base))
-            CCC_ERROR(
-                "lexer: integer literal will overflow");
+            LEXER_PANIC("integer literal will overflow");
         if (ckd_add(&iv, iv, c_val))
-            CCC_ERROR(
-                "lexer: integer literal will overflow");
+            LEXER_PANIC("integer literal will overflow");
     }
 
     if (lookahead == '.') {
@@ -118,24 +133,31 @@ static void lex_int_lit(struct token* p_token, intlit_t iv) {
     };
 }
 
+static char replace_escape_sequence(char c) { /* c: char after backslash */
+    switch (c) {
+        case '\'': case '\"': case '\\': return c; /* stand for themselves */
+        case 'r': return '\r';
+        case 'n': return '\n';
+        case 't': return '\t';
+    }
+    LEXER_PANIC("escape sequence not implemented"); /* all other escapes */
+}
+
 static void lex_char_lit(struct token* p_token) {
     int c = consume_char();
-    if (c == EOF) CCC_ERROR("lexer: unexpected EOF in char literal");
+    if (c == EOF) LEXER_PANIC("unexpected EOF in char literal");
 
     if (c == '\\') {
         c = consume_char();
-        if (c == EOF) CCC_ERROR("lexer: unexpected EOF in char literal");
-
-        if (c == '\'') c = '\'';
-        else if (c == '\"') c = '\"';
-        else CCC_ERROR(
-            "lexer: escape sequences other than quotes are not supported yet");
+        if (c == EOF) LEXER_PANIC("unexpected EOF in char literal");
+        c = replace_escape_sequence(c);
     }
 
     int close_quote = consume_char();
-    if (close_quote == EOF) CCC_ERROR("lexer: unexpected EOF in char literal");
+    if (close_quote == EOF) LEXER_PANIC("unexpected EOF in char literal");
     if (close_quote != '\'')
-        CCC_ERROR("lexer: expected \"'\", not \"%c\"", close_quote);
+        LEXER_PANIC(
+            "expected end of char literal, not \"%c\"", close_quote);
 
     *p_token = (struct token) {
         .type = CHAR_LIT,
@@ -144,17 +166,82 @@ static void lex_char_lit(struct token* p_token) {
 }
 
 static void lex_str_lit(struct token* p_token) {
-    /* TODO: impl */
+    /* Lexes the body of a string literal up to the closing quote; the
+     * opening quote has already been consumed by the caller. The token
+     * receives a heap-allocated copy of the decoded text. */
+
+    char buf[65536];
+    size_t len = 0;
+
+    for (;;) {
+        int c = consume_char();
+        if (c == '"') break; /* also covers the empty literal "" */
+        if (c == EOF || c == '\n') LEXER_PANIC("unterminated string literal");
+
+        if (c == '\\') {
+            c = consume_char();
+            if (c == EOF) LEXER_PANIC("unterminated string literal");
+            c = replace_escape_sequence(c);
+        }
+
+        /* keep one byte free for the terminator */
+        if (len >= sizeof(buf) - 1)
+            LEXER_PANIC(
+                "string literal exceeds maximum length (%zu)",
+                sizeof(buf) - 1);
+        buf[len++] = c;
+    }
+    buf[len] = 0;
+
+    /* copy exactly the bytes lexed rather than bounding by the buffer size */
+    char* text = strndup(buf, len);
+    if (text == NULL) LEXER_PANIC("out of memory");
+
+    *p_token = (struct token) {
+        .type = STR_LIT,
+        .data.str_lit = text,
+    };
+}
+
+/* Returns the two-character operator formed by c, which was already
+ * consumed, and the pending lookahead, or NOT_FOUND. Consumes nothing. */
+static enum token_type two_char_operator_type(char c) {
+    const bool eq = lookahead == '=';
+    switch (c) {
+        case '!': return eq ? NEQ : NOT_FOUND;
+        case '^': return eq ? XEQ : NOT_FOUND;
+        case '&': return eq ? AND_EQ : lookahead == '&' ? LOG_AND : NOT_FOUND;
+        case '*': return eq ? MUL_EQ : NOT_FOUND;
+        case '-': return eq ? NEG_EQ : lookahead == '>' ? ARROW : NOT_FOUND;
+        case '=': return eq ? TEST_EQ : NOT_FOUND;
+        case '+': return eq ? PLUS_EQ : NOT_FOUND;
+        case '|': return eq ? PIPE_EQ : lookahead == '|' ? LOG_PIPE : NOT_FOUND;
+        case '/': return eq ? DIV_EQ : NOT_FOUND;
+        case '%': return eq ? MOD_EQ : NOT_FOUND;
+        case '<': return eq ? LEQ : lookahead == '<' ? SHL : NOT_FOUND;
+        case '>': return eq ? GEQ : lookahead == '>' ? SHR : NOT_FOUND;
+        default: return NOT_FOUND;
+    }
 }
 
-static bool lex_complex_operator(enum token_type* p_token_type, char c) {
-    /* TODO: impl 2 char operators */
-    return false;
+/* Lexes a multi-character operator whose first character c was already
+ * consumed. Returns false, consuming nothing, when c and lookahead do not
+ * start one; otherwise stores the operator in *p_token and returns true. */
+static bool lex_complex_operator(struct token* p_token, char c) {
+    enum token_type type = two_char_operator_type(c);
+    if (type == NOT_FOUND) return false;
+    consume_char();
+    /* a shift followed by '=' is the three-character compound assignment */
+    if ((type == SHL || type == SHR) && lookahead == '=') {
+        consume_char();
+        type = (type == SHL) ? SHL_EQ : SHR_EQ;
+    }
+    *p_token = (struct token) {.type = type};
+    return true; /* success must not depend on the enum's numeric value */
 }
 
 static enum token_type lex_simple_operator(char c) {
     switch (c) {
-        case '*': return STAR;
         case '#': return HASHTAG;
         case '(': return LPAREN;
         case ')': return RPAREN;
@@ -167,22 +254,32 @@ static enum token_type lex_simple_operator(char c) {
         case ',': return COMMA;
         case '.': return DOT;
         case '?': return QMARK;
-        /* TODO: fill in */
+        case '!': return NOT;
+        case '^': return XOR;
+        case '&': return AMP;
+        case '*': return STAR;
+        case '-': return NEG;
+        case '=': return ASSIGN;
+        case '+': return PLUS;
+        case '\\': return BSLASH;
+        case '|': return PIPE;
+        case '/': return DIV;
+        case '%': return MOD;
+        case '<': return LT;
+        case '>': return GT;
     }
-    CCC_ERROR("lexer: unexpected token %c", c);
+    LEXER_PANIC("unexpected token %c", c);
 }
 
 bool lexer_pop(struct token* p_token) {
-    /* TODO: e.g. float f = .25; */
     if (file == NULL) return false;
 
     // consume all whitespace and comments preceding the next token
     int c;
     for (;;) {
         c = consume_char();
-        // one of these
         if (c == EOF) return false;
-        else if (c == '/' && lookahead == '/') {
+        else if (c == '/' && lookahead == '/') { // one of these
             while (lookahead != EOF && lookahead != '\n') consume_char();
         }
         else if (c == '/' && lookahead == '*') {
@@ -190,10 +287,13 @@ bool lexer_pop(struct token* p_token) {
             int c = consume_char();
             while (c != EOF && (c != '*' || lookahead != '/'))
                 c = consume_char();
-            if (c == EOF) CCC_ERROR("unterminated /* comment");
-            consume_char(); /* consume the final / */
+            if (c == EOF) LEXER_PANIC("unterminated /* comment");
+            consume_char(); /* consume the final slash */
+        }
+        else if (c == '\n') {
+            LINE++;
+            COL = 1;
         }
-        else if (c == '\n') LINE++;
         else if (!is_whitespace(c)) break;
     }
     
@@ -207,7 +307,7 @@ bool lexer_pop(struct token* p_token) {
         lex_char_lit(p_token);
     else if (c == '"')
         lex_str_lit(p_token);
-    else if (!lex_complex_operator(&p_token->type, c))
+    else if (!lex_complex_operator(p_token, c))
         p_token->type = lex_simple_operator(c);
 
     return true;
author	Carson Fleming <cflems@cflems.net>	2026-03-15 19:25:26 -0400
committer	Carson Fleming <cflems@cflems.net>	2026-03-15 19:25:26 -0400
commit	66e278b0752eaa9808687c0dc214b49d7b58e8eb (patch)
tree	cca09f74ab439c3d54913bc1b4355b8208a5824f /lexer.c
parent	1f85b418dd7960c28f16de21c44dcb4e2e05e694 (diff)
download	ccc-66e278b0752eaa9808687c0dc214b49d7b58e8eb.tar.gz