From 7a361c2e7385c2e670a0e2cc8d9092814ea17253 Mon Sep 17 00:00:00 2001
From: Carson Fleming <cflems@cflems.net>
Date: Fri, 13 Mar 2026 01:05:34 -0400
Subject: not even compiled once but we ball

---
 .clangd    |   4 ++
 .gitignore |   6 ++
 README.md  |   3 +
 ccc.h      |  12 ++++
 lexer.c    | 210 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 lexer.h    |  72 +++++++++++++++++++++
 main.c     |   0
 7 files changed, 307 insertions(+)
 create mode 100644 .clangd
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100644 ccc.h
 create mode 100644 lexer.c
 create mode 100644 lexer.h
 create mode 100644 main.c

diff --git a/.clangd b/.clangd
new file mode 100644
index 0000000..f52a76d
--- /dev/null
+++ b/.clangd
@@ -0,0 +1,4 @@
+CompileFlags:
+    Add:
+        - "-xc"
+        - "-std=c23"
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..4c0c7e0
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+*.o
+*.out
+build/**
+.*
+!.git*
+!.clangd
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..c979324
--- /dev/null
+++ b/README.md
@@ -0,0 +1,3 @@
+# Carson's C Compiler
+
+fuck it, we ball
diff --git a/ccc.h b/ccc.h
new file mode 100644
index 0000000..6b41480
--- /dev/null
+++ b/ccc.h
@@ -0,0 +1,12 @@
+#ifndef CCC_H
+#define CCC_H
+#include <stdio.h>
+#include <stdlib.h>
+
+#define CCC_PANIC do { perror("ccc"); exit(1); } while (0) /* errno text, exit 1 */
+#define CCC_ERROR(format, ...) do { /* needs a `long LINE` in scope */\
+    fprintf(stderr, "line %ld: " format "\n", LINE __VA_OPT__(,) __VA_ARGS__);\
+    exit(1);\
+} while (0) /* do/while(0): one statement, safe in an unbraced if/else */
+
+#endif
diff --git a/lexer.c b/lexer.c
new file mode 100644
index 0000000..04aada4
--- /dev/null
+++ b/lexer.c
@@ -0,0 +1,210 @@
+#include "ccc.h"
+#include "lexer.h"
+#include <string.h>
+#include <stdckdint.h>
+
+static FILE* file = NULL; /* current input stream; NULL until lexer_load runs */
+static int lookahead; /* next unread character from file, or EOF */
+static long LINE; /* line counter starting at 1; CCC_ERROR prints it */
+
+void lexer_load(const char* path) { /* (re)open path as the lexer's input */
+    if (file != NULL) {
+        fclose(file); /* drop the previously loaded file first */
+    }
+    file = fopen(path, "r");
+    if (file == NULL) CCC_PANIC; /* perror + exit(1), see ccc.h */
+
+    lookahead = fgetc(file); /* prime the one-character lookahead */
+    LINE = 1;
+}
+
+bool lexer_peek(struct token* p_token) { /* lex the next token, then rewind */
+    if (file == NULL) return false;
+    long orig_offset = ftell(file), orig_line = LINE; /* state pop will change */
+    int orig_lookahead = lookahead;
+    bool rv = lexer_pop(p_token);
+    lookahead = orig_lookahead;
+    LINE = orig_line; /* else newlines skipped here are counted twice */
+    fseek(file, orig_offset, SEEK_SET);
+    return rv;
+}
+
+#define is_whitespace(c) ((c) == ' ' || ('\t' <= (c) && (c) <= '\r')) /* \t\n\v\f\r */
+#define is_lower_alpha(c) ('a' <= (c) && (c) <= 'z') /* ASCII ranges assumed */
+#define is_upper_alpha(c) ('A' <= (c) && (c) <= 'Z')
+#define is_alpha(c) (is_lower_alpha(c) || is_upper_alpha(c))
+#define is_numeric(c) ('0' <= (c) && (c) <= '9')
+#define is_alphanumeric(c) (is_alpha(c) || is_numeric(c)) /* arg evaluated >1x */
+#define is_ident_legal(c) (is_alphanumeric(c) || (c) == '_' || (c) == '$')
+
+#define REFUND_CHAR fseek(file, -1, SEEK_CUR) /* NOTE(review): unused in lexer.c; does not reset lookahead */
+
+static int consume_char() { /* hand out the pending character, refill lookahead */
+    int rv = lookahead;
+    lookahead = fgetc(file); /* EOF once the input is exhausted */
+    return rv;
+}
+
+/* Lex an identifier whose first character `ic` the caller already consumed.
+ * Stores a heap-allocated copy of the name in p_token->data.identifier. */
+static void lex_ident(struct token* p_token, char ic) {
+    char buf[1024] = {ic};
+    unsigned int len = 1;
+
+    while (is_ident_legal(lookahead)) {
+        int c = consume_char();
+        if (len >= sizeof(buf) - 1) /* keep room for the terminator */
+            CCC_ERROR("identifier exceeds maximum size (%zu)", sizeof(buf));
+        buf[len++] = c;
+    }
+
+    char* name = strndup(buf, len); /* heap copy; never freed in this file */
+    if (name == NULL) CCC_PANIC; /* allocation failed */
+    *p_token = (struct token) {.type = IDENTIFIER, .data.identifier = name};
+}
+
+static void lex_float_lit( /* stub: always exits through CCC_ERROR */
+    struct token* p_token,
+    unsigned char base, /* radix passed by the caller (10 for ".5"-style input) */
+    double iv /* integer part lexed so far; 0 when the literal starts with '.' */
+) {
+    CCC_ERROR("lexer: floating point literals are not supported yet");
+}
+
+static void lex_int_lit(struct token* p_token, intlit_t iv) { /* iv: first digit */
+    unsigned char base = 10;
+
+    /* TODO: exponent notation, e.g. 2e10 */
+    if (iv == 0) { /* leading 0: 0x/0X hex, 0b/0B binary, otherwise octal */
+        if (lookahead == 'x' || lookahead == 'X'
+                || lookahead == 'b' || lookahead == 'B') {
+            base = (lookahead == 'x' || lookahead == 'X') ? 16 : 2;
+            int suffix = consume_char(); /* the x/X/b/B letter itself */
+            if (!is_alphanumeric(lookahead)) /* prefix with no digits after it */
+                CCC_ERROR(
+                    "lexer: invalid suffix on integer constant: %c", suffix);
+        } else base = 8;
+    }
+
+    while (is_alphanumeric(lookahead)) { /* letters are taken as digits >= 10 */
+        int c = consume_char();
+        intlit_t c_val;
+
+        if (is_numeric(c)) c_val = c - '0';
+        else if (is_lower_alpha(c)) c_val = c - 'a' + 10;
+        else c_val = c - 'A' + 10;
+
+        if (c_val >= base) /* also rejects type suffixes such as u or L */
+            CCC_ERROR(
+                "lexer: invalid digit in base %hhu: %c",
+                base,
+                c);
+
+        if (ckd_mul(&iv, iv, base)) /* checked: iv = iv * base */
+            CCC_ERROR(
+                "lexer: integer literal will overflow");
+        if (ckd_add(&iv, iv, c_val)) /* checked: iv = iv + c_val */
+            CCC_ERROR(
+                "lexer: integer literal will overflow");
+    }
+
+    if (lookahead == '.') { /* decimal point: hand off to lex_float_lit */
+        consume_char();
+        lex_float_lit(p_token, base, iv);
+        return;
+    }
+
+    *p_token = (struct token) {
+        .type = INT_LIT,
+        .data.int_lit = iv,
+    };
+}
+
+static void lex_char_lit(struct token* p_token) {
+    /* The opening quote is already consumed; read the literal's payload. */
+    int value = consume_char();
+    if (value == EOF) CCC_ERROR("lexer: unexpected EOF in char literal");
+
+    if (value == '\\') {
+        /* Only \' and \" are understood; each stands for itself. */
+        value = consume_char();
+        switch (value) {
+            case EOF: CCC_ERROR("lexer: unexpected EOF in char literal");
+            case '\'':
+            case '\"': break;
+            default: CCC_ERROR(
+                "lexer: escape sequences other than quotes are not supported yet");
+        }
+    }
+
+    int terminator = consume_char();
+    if (terminator == EOF) CCC_ERROR("lexer: unexpected EOF in char literal");
+    if (terminator != '\'')
+        CCC_ERROR("lexer: expected \"'\", not \"%c\"", terminator);
+
+    *p_token = (struct token) {.type = CHAR_LIT, .data.char_lit = value};
+}
+
+static void lex_str_lit(struct token* p_token) { /* stub: fail loudly, never leave *p_token unset */
+    CCC_ERROR("lexer: string literals are not supported yet");
+}
+
+enum token_type lex_simple(char c) {
+    /* Every token that is spelled with exactly one character. */
+    static const struct { char ch; enum token_type type; } singles[] = {
+        {'*', STAR}, /* TODO: *= */
+        {'#', HASHTAG},
+        {'(', LPAREN}, {')', RPAREN},
+        {'{', LCURLY}, {'}', RCURLY},
+        {'[', LSQUARE}, {']', RSQUARE},
+        {':', COLON}, {';', SEMI},
+        {',', COMMA}, {'.', DOT},
+        {'?', QMARK},
+    };
+    for (size_t i = 0; i < sizeof(singles) / sizeof(singles[0]); i++) {
+        if (singles[i].ch == c) return singles[i].type;
+    }
+
+    CCC_ERROR("lexer: unexpected token %c", c);
+}
+
+bool lexer_pop(struct token* p_token) { /* false at EOF or before lexer_load */
+    /* TODO: e.g. float f = .25; */
+    if (file == NULL) return false;
+    // consume all whitespace and comments preceding the next token
+    int c;
+    for (;;) {
+        c = consume_char();
+        if (c == EOF) return false;
+        else if (c == '/' && lookahead == '/') {
+            while (lookahead != EOF && lookahead != '\n') consume_char();
+        }
+        else if (c == '/' && lookahead == '*') {
+            consume_char(); /* consume the * */
+            int cc = consume_char(); /* own name: must not shadow the outer c */
+            while (cc != EOF && (cc != '*' || lookahead != '/')) {
+                if (cc == '\n') LINE++; /* comments may span several lines */
+                cc = consume_char();
+            }
+            if (cc == EOF) CCC_ERROR("unterminated /* comment");
+            consume_char(); /* consume the final / */
+        }
+        else if (c == '\n') LINE++;
+        else if (!is_whitespace(c)) break;
+    }
+
+    if (is_numeric(c))
+        lex_int_lit(p_token, c - '0');
+    else if (c == '.' && is_numeric(lookahead))
+        lex_float_lit(p_token, 10, 0);
+    else if (is_ident_legal(c))
+        lex_ident(p_token, c);
+    else if (c == '\'')
+        lex_char_lit(p_token);
+    else if (c == '"')
+        lex_str_lit(p_token);
+    else
+        *p_token = (struct token) {.type = lex_simple(c)};
+
+    return true;
+}
diff --git a/lexer.h b/lexer.h
new file mode 100644
index 0000000..62ee9c2
--- /dev/null
+++ b/lexer.h
@@ -0,0 +1,72 @@
+#ifndef LEXER_H
+#define LEXER_H
+
+enum token_type { /* token kinds; lexer.c so far emits only the annotated ones */
+    IDENTIFIER, /* payload in data.identifier */
+    INT_LIT, /* payload in data.int_lit */
+    CHAR_LIT, /* payload in data.char_lit */
+    STR_LIT,
+    HASHTAG, /* '#' */
+    LPAREN, /* '(' */
+    RPAREN, /* ')' */
+    LCURLY, /* '{' */
+    RCURLY, /* '}' */
+    LSQUARE, /* '[' */
+    RSQUARE, /* ']' */
+    COLON, /* ':' */
+    SEMI, /* ';' */
+    COMMA, /* ',' */
+    DOT, /* '.' */
+    QMARK, /* '?' */
+    NOT,
+    NEQ,
+    XOR,
+    XEQ,
+    AMP,
+    LOG_AND,
+    AND_EQ,
+    STAR, /* '*' */
+    MUL_EQ,
+    NEG,
+    NEG_EQ,
+    ARROW,
+    ASSIGN,
+    TEST_EQ,
+    PLUS,
+    PLUS_EQ,
+    BSLASH,
+    PIPE,
+    LOG_PIPE,
+    PIPE_EQ,
+    DIV,
+    DIV_EQ, // comments too
+    LT,
+    GT,
+    LEQ,
+    GEQ,
+    SHR,
+    SHR_EQ,
+    SHL,
+    SHL_EQ
+    /* more to come */
+    // ->, everything that can precede = (multi-symbols)
+};
+
+typedef unsigned long long intlit_t; /* value type of an integer literal */
+
+struct token {
+    enum token_type type; /* for the literal/identifier kinds, picks the data member */
+    union {
+        char* identifier; /* IDENTIFIER: strndup'd name set by lex_ident */
+        intlit_t int_lit; /* INT_LIT: value accumulated by lex_int_lit */
+        char char_lit; /* CHAR_LIT: character read by lex_char_lit */
+        char* str_lit; /* presumably for STR_LIT; nothing in lexer.c writes it yet */
+        void* unused;
+    } data;
+};
+
+void lexer_load(const char* path); /* open path for lexing; exits if fopen fails */
+bool lexer_peek(struct token* p_token); /* lex the next token, then rewind the input */
+bool lexer_pop(struct token* p_token); /* lex and consume the next token; false at EOF */
+
+#endif
diff --git a/main.c b/main.c
new file mode 100644
index 0000000..e69de29
-- 
cgit v1.2.3