From 7a361c2e7385c2e670a0e2cc8d9092814ea17253 Mon Sep 17 00:00:00 2001
From: Carson Fleming
Date: Fri, 13 Mar 2026 01:05:34 -0400
Subject: not even compiled once but we ball

---
 .clangd    |   4 +
 .gitignore |   6 ++
 README.md  |   3 +
 ccc.h      |  19 +++++
 lexer.c    | 272 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 lexer.h    |  77 +++++++++++++++++
 main.c     |   0
 7 files changed, 381 insertions(+)
 create mode 100644 .clangd
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100644 ccc.h
 create mode 100644 lexer.c
 create mode 100644 lexer.h
 create mode 100644 main.c

diff --git a/.clangd b/.clangd
new file mode 100644
index 0000000..f52a76d
--- /dev/null
+++ b/.clangd
@@ -0,0 +1,4 @@
+CompileFlags:
+  Add:
+    - "-xc"
+    - "-std=c23"
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..4c0c7e0
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+*.o
+*.out
+build/**
+.*
+!.git*
+!.clangd
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..c979324
--- /dev/null
+++ b/README.md
@@ -0,0 +1,3 @@
+# Carson's C Compiler
+
+fuck it, we ball
diff --git a/ccc.h b/ccc.h
new file mode 100644
index 0000000..6b41480
--- /dev/null
+++ b/ccc.h
@@ -0,0 +1,19 @@
+#ifndef CCC_H
+#define CCC_H
+#include <stdio.h>
+#include <stdlib.h>
+
+/* Print the pending errno message via perror() and exit with status 1. */
+#define CCC_PANIC do { perror("ccc"); exit(1); } while (0)
+
+/*
+ * Print a printf-style diagnostic prefixed with the current line number and
+ * exit with status 1. The expansion reads a `long LINE` that must be in scope
+ * at the point of use (lexer.c defines it at file scope).
+ */
+#define CCC_ERROR(format, ...) do {\
+    fprintf(stderr, "line %ld: " format "\n", LINE __VA_OPT__(,) __VA_ARGS__);\
+    exit(1);\
+} while (0)
+
+#endif
diff --git a/lexer.c b/lexer.c
new file mode 100644
index 0000000..04aada4
--- /dev/null
+++ b/lexer.c
@@ -0,0 +1,272 @@
+#include "ccc.h"
+#include "lexer.h"
+#include <stdckdint.h>
+#include <string.h>
+
+/*
+ * Lexer state: the open input file, one character of lookahead, and the
+ * 1-based line number that CCC_ERROR reports.
+ */
+static FILE* file = NULL;
+static int lookahead;
+static long LINE;
+
+/*
+ * Open `path` for lexing, closing any previously loaded file first.
+ * Primes the one-character lookahead and resets the line counter to 1.
+ * Exits through CCC_PANIC if the file cannot be opened.
+ */
+void lexer_load(const char* path) {
+    if (file != NULL) {
+        fclose(file);
+    }
+    file = fopen(path, "r");
+    if (file == NULL) CCC_PANIC;
+
+    lookahead = fgetc(file);
+    LINE = 1;
+}
+
+/*
+ * Lex the next token into *p_token without consuming it.
+ * Runs lexer_pop() and then restores the file offset, the lookahead and the
+ * line counter, so the next lexer_pop() sees the same input again.
+ * Returns false when no file is loaded or the input is exhausted.
+ * An IDENTIFIER token carries a fresh strndup() copy on every call.
+ */
+bool lexer_peek(struct token* p_token) {
+    if (file == NULL) return false;
+
+    long orig_offset = ftell(file);
+    if (orig_offset < 0) CCC_PANIC;
+    int orig_lookahead = lookahead;
+    long orig_line = LINE;
+
+    bool rv = lexer_pop(p_token);
+
+    /* lexer_pop() counts the newlines it skips; undo that with the seek */
+    LINE = orig_line;
+    lookahead = orig_lookahead;
+    if (fseek(file, orig_offset, SEEK_SET) != 0) CCC_PANIC;
+    return rv;
+}
+
+/* Character classes; arguments are parenthesized so any expression works. */
+#define is_whitespace(c) \
+    ((c) == ' ' || (c) == '\t' || (c) == '\n' || (c) == '\r')
+#define is_lower_alpha(c) ('a' <= (c) && (c) <= 'z')
+#define is_upper_alpha(c) ('A' <= (c) && (c) <= 'Z')
+#define is_alpha(c) (is_lower_alpha(c) || is_upper_alpha(c))
+#define is_numeric(c) ('0' <= (c) && (c) <= '9')
+#define is_alphanumeric(c) (is_alpha(c) || is_numeric(c))
+#define is_ident_legal(c) (is_alphanumeric(c) || (c) == '_' || (c) == '$')
+
+/* Return the current lookahead character and read the next one from file. */
+static int consume_char(void) {
+    int rv = lookahead;
+    lookahead = fgetc(file);
+    return rv;
+}
+
+/*
+ * Lex an identifier whose first character `ic` has already been consumed.
+ * The token's data.identifier is a heap copy made with strndup().
+ */
+static void lex_ident(struct token* p_token, char ic) {
+    char buf[1024] = {ic};
+    size_t len = 1;
+
+    while (is_ident_legal(lookahead)) {
+        int c = consume_char();
+        if (len >= sizeof(buf) - 1)
+            CCC_ERROR("identifier exceeds maximum size (%zu)", sizeof(buf));
+        buf[len++] = (char)c;
+    }
+
+    /* copy exactly the characters read; strndup() adds the terminator */
+    char* name = strndup(buf, len);
+    if (name == NULL) CCC_PANIC;
+
+    *p_token = (struct token) {
+        .type = IDENTIFIER,
+        .data.identifier = name,
+    };
+}
+
+/* Placeholder: floating point literals always abort with an error for now. */
+static void lex_float_lit(
+    struct token* p_token,
+    unsigned char base,
+    double iv
+) {
+    (void)p_token;
+    (void)base;
+    (void)iv;
+    CCC_ERROR("lexer: floating point literals are not supported yet");
+}
+
+/*
+ * Lex an integer literal whose first digit, with value `iv`, has already been
+ * consumed. A leading 0 selects base 16 (0x/0X), base 2 (0b/0B) or base 8.
+ * Digits outside the base and overflow of intlit_t are reported as errors.
+ * Integer suffixes (u, l, ...) are not handled; they fail the digit check.
+ */
+static void lex_int_lit(struct token* p_token, intlit_t iv) {
+    unsigned char base = 10;
+
+    /* TODO: exponentiation, 2e10 f.e. */
+    if (iv == 0) {
+        if (lookahead == 'x' || lookahead == 'X'
+            || lookahead == 'b' || lookahead == 'B') {
+            base = (lookahead == 'x' || lookahead == 'X') ? 16 : 2;
+            int suffix = consume_char();
+            if (!is_alphanumeric(lookahead))
+                CCC_ERROR(
+                    "lexer: invalid suffix on integer constant: %c", suffix);
+        } else base = 8;
+    }
+
+    while (is_alphanumeric(lookahead)) {
+        int c = consume_char();
+        intlit_t c_val;
+
+        if (is_numeric(c)) c_val = c - '0';
+        else if (is_lower_alpha(c)) c_val = c - 'a' + 10;
+        else c_val = c - 'A' + 10;
+
+        if (c_val >= base)
+            CCC_ERROR(
+                "lexer: invalid digit in base %hhu: %c",
+                base,
+                c);
+
+        if (ckd_mul(&iv, iv, base))
+            CCC_ERROR(
+                "lexer: integer literal will overflow");
+        if (ckd_add(&iv, iv, c_val))
+            CCC_ERROR(
+                "lexer: integer literal will overflow");
+    }
+
+    if (lookahead == '.') {
+        consume_char();
+        lex_float_lit(p_token, base, iv);
+        return;
+    }
+
+    *p_token = (struct token) {
+        .type = INT_LIT,
+        .data.int_lit = iv,
+    };
+}
+
+/*
+ * Lex a character literal; the opening quote has already been consumed.
+ * Only the \' and \" escape sequences are supported so far.
+ */
+static void lex_char_lit(struct token* p_token) {
+    int c = consume_char();
+    if (c == EOF) CCC_ERROR("lexer: unexpected EOF in char literal");
+    /* '' would otherwise lex as the character ' */
+    if (c == '\'') CCC_ERROR("lexer: empty char literal");
+
+    if (c == '\\') {
+        c = consume_char();
+        if (c == EOF) CCC_ERROR("lexer: unexpected EOF in char literal");
+
+        /* \' and \" stand for the quote character itself */
+        if (c != '\'' && c != '\"')
+            CCC_ERROR(
+                "lexer: escape sequences other than quotes are not supported yet");
+    }
+
+    int close_quote = consume_char();
+    if (close_quote == EOF) CCC_ERROR("lexer: unexpected EOF in char literal");
+    if (close_quote != '\'')
+        CCC_ERROR("lexer: expected \"'\", not \"%c\"", close_quote);
+
+    *p_token = (struct token) {
+        .type = CHAR_LIT,
+        .data.char_lit = (char)c,
+    };
+}
+
+/* Placeholder: string literals always abort with an error for now. */
+static void lex_str_lit(struct token* p_token) {
+    (void)p_token;
+    CCC_ERROR("lexer: string literals are not supported yet");
+}
+
+/*
+ * Map a single-character punctuator to its token type.
+ * Any character without a case below is reported as an error.
+ */
+static enum token_type lex_simple(char c) {
+    switch (c) {
+        case '*': return STAR; /* TODO: *= */
+        case '#': return HASHTAG;
+        case '(': return LPAREN;
+        case ')': return RPAREN;
+        case '{': return LCURLY;
+        case '}': return RCURLY;
+        case '[': return LSQUARE;
+        case ']': return RSQUARE;
+        case ':': return COLON;
+        case ';': return SEMI;
+        case ',': return COMMA;
+        case '.': return DOT;
+        case '?': return QMARK;
+        default: break;
+    }
+    CCC_ERROR("lexer: unexpected token %c", c);
+}
+
+/*
+ * Consume the next token from the loaded file into *p_token.
+ * Skips whitespace and both comment styles first, counting newlines in LINE.
+ * Returns false when no file is loaded or the input is exhausted, and true
+ * once *p_token has been filled in. Lexical errors exit through CCC_ERROR.
+ */
+bool lexer_pop(struct token* p_token) {
+    /* TODO: e.g. float f = .25; */
+    if (file == NULL) return false;
+
+    // consume all whitespace and comments preceding the next token
+    int c;
+    for (;;) {
+        c = consume_char();
+        if (c == EOF) return false;
+        else if (c == '/' && lookahead == '/') {
+            /* stop before the newline so the branch below counts it */
+            while (lookahead != EOF && lookahead != '\n') consume_char();
+        }
+        else if (c == '/' && lookahead == '*') {
+            consume_char(); /* consume the * */
+            int cc = consume_char();
+            while (cc != EOF && (cc != '*' || lookahead != '/')) {
+                /* keep LINE accurate across multi-line comments */
+                if (cc == '\n') LINE++;
+                cc = consume_char();
+            }
+            if (cc == EOF) CCC_ERROR("unterminated /* comment");
+            consume_char(); /* consume the final / */
+        }
+        else if (c == '\n') LINE++;
+        else if (!is_whitespace(c)) break;
+    }
+
+    if (is_numeric(c))
+        lex_int_lit(p_token, c - '0');
+    else if (c == '.' && is_numeric(lookahead))
+        lex_float_lit(p_token, 10, 0);
+    else if (is_ident_legal(c))
+        lex_ident(p_token, (char)c);
+    else if (c == '\'')
+        lex_char_lit(p_token);
+    else if (c == '"')
+        lex_str_lit(p_token);
+    else
+        *p_token = (struct token) {.type = lex_simple((char)c)};
+
+    return true;
+}
diff --git a/lexer.h b/lexer.h
new file mode 100644
index 0000000..62ee9c2
--- /dev/null
+++ b/lexer.h
@@ -0,0 +1,77 @@
+#ifndef LEXER_H
+#define LEXER_H
+
+enum token_type {
+    IDENTIFIER,
+    INT_LIT,
+    CHAR_LIT,
+    STR_LIT,
+    HASHTAG,
+    LPAREN,
+    RPAREN,
+    LCURLY,
+    RCURLY,
+    LSQUARE,
+    RSQUARE,
+    COLON,
+    SEMI,
+    COMMA,
+    DOT,
+    QMARK,
+    NOT,
+    NEQ,
+    XOR,
+    XEQ,
+    AMP,
+    LOG_AND,
+    AND_EQ,
+    STAR,
+    MUL_EQ,
+    NEG,
+    NEG_EQ,
+    ARROW,
+    ASSIGN,
+    TEST_EQ,
+    PLUS,
+    PLUS_EQ,
+    BSLASH,
+    PIPE,
+    LOG_PIPE,
+    PIPE_EQ,
+    DIV,
+    DIV_EQ, // comments too
+    LT,
+    GT,
+    LEQ,
+    GEQ,
+    SHR,
+    SHR_EQ,
+    SHL,
+    SHL_EQ
+    /* more to come */
+    // ->, everything that can precede = (multi-symbols)
+};
+
+/* Value type of integer literals. */
+typedef unsigned long long intlit_t;
+
+/* A lexed token; which `data` member is meaningful depends on `type`. */
+struct token {
+    enum token_type type;
+    union {
+        char* identifier;
+        intlit_t int_lit;
+        char char_lit;
+        char* str_lit;
+        void* unused;
+    } data;
+};
+
+/* Open `path` as the lexer input, replacing any previously loaded file. */
+void lexer_load(const char* path);
+/* Lex the next token without consuming it; false if none is available. */
+bool lexer_peek(struct token* p_token);
+/* Lex and consume the next token; false if none is available. */
+bool lexer_pop(struct token* p_token);
+
+#endif
diff --git a/main.c b/main.c
new file mode 100644
index 0000000..e69de29
-- 
cgit v1.2.3