diff options
| author | Carson Fleming <cflems@cflems.net> | 2026-03-26 16:21:29 -0400 |
|---|---|---|
| committer | Carson Fleming <cflems@cflems.net> | 2026-03-26 16:22:00 -0400 |
| commit | 7d9fb2c733c8c64f6f74eefa0eea35b36be102cd (patch) | |
| tree | 16b6cded5f9611e0ff1948395578845c9688b926 | |
| parent | 68db110d34611fc8bb79035d3a11bba07dea43f3 (diff) | |
| download | ccc-7d9fb2c733c8c64f6f74eefa0eea35b36be102cd.tar.gz | |
let's go we can parse return zero most useful program ever
| -rw-r--r-- | ast.c | 90 | ||||
| -rw-r--r-- | ast.h | 77 | ||||
| -rw-r--r-- | lexer.c | 150 | ||||
| -rw-r--r-- | lexer.h | 107 | ||||
| -rw-r--r-- | main.c | 44 | ||||
| -rw-r--r-- | parser.c | 190 | ||||
| -rw-r--r-- | parser.h | 8 | ||||
| -rw-r--r-- | test/simple.c | 3 |
8 files changed, 548 insertions, 121 deletions
@@ -0,0 +1,90 @@ +#include "ast.h" +#include <stdlib.h> + +static void expr_destroy(struct expr_node* node); +static void stmt_destroy(struct stmt_node* node); + +static void type_destroy(struct type_node* node) { + free(node->type); +} + +static void var_decl_destroy(struct var_decl_node* node) { + type_destroy(&node->type); + + free(node->ident); +} + +static void group_destroy(struct group_node* node) { + struct stmt_node* body_node = node->body_head; + while (body_node != NULL) { + struct stmt_node* next = body_node->next; + stmt_destroy(body_node); + free(body_node); + body_node = next; + } +} + +static void fn_decl_destroy(struct fn_decl_node* node) { + type_destroy(&node->return_type); + + free(node->name); + + struct var_decl_node* args_node = node->args_head; + while (args_node != NULL) { + struct var_decl_node* next = args_node->next; + var_decl_destroy(args_node); + free(args_node); + args_node = next; + } + + group_destroy(&node->body); +} + +static void return_destroy(struct return_node* node) { + if (node->ret_val != NULL) { + expr_destroy(node->ret_val); + free(node->ret_val); + } +} + +static void expr_destroy(struct expr_node* node) { + switch (node->type) { + case EXPR_EMPTY: + case EXPR_INT_LIT: + break; + case EXPR_VAR_DECL: + var_decl_destroy(&node->as._var_decl); + break; + case EXPR_RETURN: + return_destroy(&node->as._return); + break; + } +} + +static void stmt_destroy(struct stmt_node* node) { + switch (node->type) { + case STMT_EXPR: + expr_destroy(&node->as._expr); + break; + case STMT_GROUP: + group_destroy(&node->as._group); + } +} + +static void root_node_destroy(struct root_node* node) { + switch (node->type) { + case ROOT_FN_DECL: + fn_decl_destroy(&node->as._fn_decl); + break; + } +} + +void ast_destroy(struct root_node* head) { + struct root_node* node = head; + while (node != NULL) { + struct root_node* next = node->next; + root_node_destroy(node); + free(node); + node = next; + } +} @@ -0,0 +1,77 @@ +#ifndef AST_H +#define AST_H + +struct stmt_node; +struct expr_node; + +struct type_node { + char* type; +}; + +struct var_decl_node { + struct type_node type; + char* ident; + + struct var_decl_node* next; +}; + +struct group_node { + struct stmt_node* body_head; +}; + +struct fn_decl_node { + struct type_node return_type; + char* name; + struct var_decl_node* args_head; + struct group_node body; +}; + +struct return_node { + struct expr_node* ret_val; /* null to return void */ +}; + +struct int_lit_node { + long long val; +}; + +struct expr_node { + enum { + EXPR_EMPTY, + EXPR_VAR_DECL, + EXPR_RETURN, + EXPR_INT_LIT, + } type; + union { + struct var_decl_node _var_decl; + struct return_node _return; + struct int_lit_node _int_lit; + } as; +}; + +struct stmt_node { + enum { + STMT_EXPR, + STMT_GROUP, + } type; + union { + struct expr_node _expr; + struct group_node _group; + } as; + + struct stmt_node* next; +}; + +struct root_node { + enum { + ROOT_FN_DECL, + } type; + union { + struct fn_decl_node _fn_decl; + } as; + + struct root_node* next; +}; + +void ast_destroy(struct root_node* head); + +#endif @@ -7,12 +7,14 @@ static FILE* file = NULL; static int lookahead; +static const char* PATH; static unsigned long LINE, COL; #define LEXER_PANIC(format, ...) {\ fprintf(\ stderr,\ - "ccc: lexer error: line %lu, column %lu: " format "\n",\ + "ccc: lexer error: %s: line %lu, column %lu: " format "\n",\ + PATH,\ LINE,\ COL __VA_OPT__(,)\ __VA_ARGS__);\ @@ -27,6 +29,7 @@ void lexer_load(const char* path) { if (file == NULL) CCC_PANIC; lookahead = fgetc(file); + PATH = path; LINE = 1; COL = 1; } @@ -42,9 +45,15 @@ bool lexer_peek(struct token* p_token) { long orig_offset = ftell(file); int orig_lookahead = lookahead; + unsigned long orig_line = LINE, orig_col = COL; + bool rv = lexer_pop(p_token); + + LINE = orig_line; + COL = orig_col; lookahead = orig_lookahead; fseek(file, orig_offset, SEEK_SET); + return rv; } @@ -79,8 +88,11 @@ static void lex_ident(struct token* p_token, char ic) { buf[len] = 0; *p_token = (struct token) { - .type = IDENTIFIER, - .data.identifier = strndup(buf, sizeof(buf) - 1), + .type = TK_IDENT, + .data.ident = strndup(buf, sizeof(buf) - 1), + .PATH = PATH, + .LINE = LINE, + .COL = COL, }; } @@ -110,8 +122,11 @@ static void lex_float_lit( } *p_token = (struct token) { - .type = FLOAT_LIT, + .type = TK_FLOAT_LIT, .data.float_lit = iv, + .PATH = PATH, + .LINE = LINE, + .COL = COL, }; } @@ -145,8 +160,11 @@ static void lex_int_lit(struct token* p_token, int_lit_t iv) { } *p_token = (struct token) { - .type = INT_LIT, + .type = TK_INT_LIT, .data.int_lit = iv, + .PATH = PATH, + .LINE = LINE, + .COL = COL, }; } @@ -185,8 +203,11 @@ static void lex_char_lit(struct token* p_token) { "expected end of char literal, not \"%c\"", close_quote); *p_token = (struct token) { - .type = CHAR_LIT, + .type = TK_CHAR_LIT, .data.char_lit = c, + .PATH = PATH, + .LINE = LINE, + .COL = COL, }; } @@ -194,8 +215,11 @@ static void lex_str_lit(struct token* p_token) { if (lookahead == '"') { consume_char(); *p_token = (struct token) { - .type = STR_LIT, + .type = TK_STR_LIT, .data.str_lit = strdup(""), + .PATH = PATH, + .LINE = LINE, + .COL = COL, }; return; } @@ -223,75 +247,83 @@ static void lex_str_lit(struct token* p_token) { buf[len] = 0; *p_token = (struct token) { - .type = STR_LIT, + .type = TK_STR_LIT, .data.str_lit = strndup(buf, sizeof(buf) - 1), + .PATH = PATH, + .LINE = LINE, + .COL = COL, }; } static enum token_type two_char_operator_type(char c) { - if (c == '!' && lookahead == '=') return NEQ; - if (c == '^' && lookahead == '=') return XEQ; - if (c == '&' && lookahead == '=') return AND_EQ; - if (c == '&' && lookahead == '&') return LOG_AND; - if (c == '*' && lookahead == '=') return MUL_EQ; - if (c == '-' && lookahead == '=') return NEG_EQ; - if (c == '-' && lookahead == '>') return ARROW; - if (c == '=' && lookahead == '=') return TEST_EQ; - if (c == '+' && lookahead == '=') return PLUS_EQ; - if (c == '|' && lookahead == '|') return LOG_PIPE; - if (c == '|' && lookahead == '=') return PIPE_EQ; - if (c == '/' && lookahead == '=') return DIV_EQ; - if (c == '%' && lookahead == '=') return MOD_EQ; - if (c == '<' && lookahead == '=') return LEQ; - if (c == '>' && lookahead == '=') return GEQ; - if (c == '<' && lookahead == '<') return SHL; - if (c == '>' && lookahead == '>') return SHR; - return NOT_FOUND; + if (c == '!' && lookahead == '=') return TK_NEQ; + if (c == '^' && lookahead == '=') return TK_XEQ; + if (c == '&' && lookahead == '=') return TK_AND_EQ; + if (c == '&' && lookahead == '&') return TK_LOG_AND; + if (c == '*' && lookahead == '=') return TK_MUL_EQ; + if (c == '-' && lookahead == '=') return TK_NEG_EQ; + if (c == '-' && lookahead == '>') return TK_ARROW; + if (c == '=' && lookahead == '=') return TK_TEST_EQ; + if (c == '+' && lookahead == '=') return TK_PLUS_EQ; + if (c == '|' && lookahead == '|') return TK_LOG_PIPE; + if (c == '|' && lookahead == '=') return TK_PIPE_EQ; + if (c == '/' && lookahead == '=') return TK_DIV_EQ; + if (c == '%' && lookahead == '=') return TK_MOD_EQ; + if (c == '<' && lookahead == '=') return TK_LEQ; + if (c == '>' && lookahead == '=') return TK_GEQ; + if (c == '<' && lookahead == '<') return TK_SHL; + if (c == '>' && lookahead == '>') return TK_SHR; + return TK_NOT_FOUND; } static bool lex_complex_operator(struct token* p_token, char c) { enum token_type type = two_char_operator_type(c); - if (type == NOT_FOUND) return false; + if (type == TK_NOT_FOUND) return false; consume_char(); - if (type == SHL && lookahead == '=') { + if (type == TK_SHL && lookahead == '=') { consume_char(); - type = SHL_EQ; + type = TK_SHL_EQ; } - if (type == SHR && lookahead == '=') { + if (type == TK_SHR && lookahead == '=') { consume_char(); - type = SHR_EQ; + type = TK_SHR_EQ; } - *p_token = (struct token) {.type = type}; + *p_token = (struct token) { + .type = type, + .PATH = PATH, + .LINE = LINE, + .COL = COL, + }; return type; } static enum token_type lex_simple_operator(char c) { switch (c) { - case '#': return HASHTAG; - case '(': return LPAREN; - case ')': return RPAREN; - case '{': return LCURLY; - case '}': return RCURLY; - case '[': return LSQUARE; - case ']': return RSQUARE; - case ':': return COLON; - case ';': return SEMI; - case ',': return COMMA; - case '.': return DOT; - case '?': return QMARK; - case '!': return NOT; - case '^': return XOR; - case '&': return AMP; - case '*': return STAR; - case '-': return NEG; - case '=': return ASSIGN; - case '+': return PLUS; - case '\\': return BSLASH; - case '|': return PIPE; - case '/': return DIV; - case '%': return MOD; - case '<': return LT; - case '>': return GT; + case '#': return TK_HASHTAG; + case '(': return TK_LPAREN; + case ')': return TK_RPAREN; + case '{': return TK_LCURLY; + case '}': return TK_RCURLY; + case '[': return TK_LSQUARE; + case ']': return TK_RSQUARE; + case ':': return TK_COLON; + case ';': return TK_SEMI; + case ',': return TK_COMMA; + case '.': return TK_DOT; + case '?': return TK_QMARK; + case '!': return TK_NOT; + case '^': return TK_XOR; + case '&': return TK_AMP; + case '*': return TK_STAR; + case '-': return TK_NEG; + case '=': return TK_ASSIGN; + case '+': return TK_PLUS; + case '\\': return TK_BSLASH; + case '|': return TK_PIPE; + case '/': return TK_DIV; + case '%': return TK_MOD; + case '<': return TK_LT; + case '>': return TK_GT; } LEXER_PANIC("unexpected token %c", c); } @@ -337,3 +369,7 @@ bool lexer_pop(struct token* p_token) { return true; } + +bool lexer_eof() { + return lookahead == EOF; +} @@ -2,56 +2,56 @@ #define LEXER_H enum token_type { - NOT_FOUND, - IDENTIFIER, - INT_LIT, - FLOAT_LIT, // TODO - CHAR_LIT, - STR_LIT, - HASHTAG, - LPAREN, - RPAREN, - LCURLY, - RCURLY, - LSQUARE, - RSQUARE, - COLON, - SEMI, - COMMA, - DOT, - QMARK, - NOT, - NEQ, - XOR, - XEQ, - AMP, - LOG_AND, - AND_EQ, - STAR, - MUL_EQ, - NEG, - NEG_EQ, - ARROW, - ASSIGN, - TEST_EQ, - PLUS, - PLUS_EQ, - BSLASH, - PIPE, - LOG_PIPE, - PIPE_EQ, - DIV, - DIV_EQ, - MOD, - MOD_EQ, - LT, - GT, - LEQ, - GEQ, - SHR, - SHR_EQ, - SHL, - SHL_EQ + TK_NOT_FOUND, + TK_IDENT, + TK_INT_LIT, + TK_FLOAT_LIT, + TK_CHAR_LIT, + TK_STR_LIT, + TK_HASHTAG, + TK_LPAREN, + TK_RPAREN, + TK_LCURLY, + TK_RCURLY, + TK_LSQUARE, + TK_RSQUARE, + TK_COLON, + TK_SEMI, + TK_COMMA, + TK_DOT, + TK_QMARK, + TK_NOT, + TK_NEQ, + TK_XOR, + TK_XEQ, + TK_AMP, + TK_LOG_AND, + TK_AND_EQ, + TK_STAR, + TK_MUL_EQ, + TK_NEG, + TK_NEG_EQ, + TK_ARROW, + TK_ASSIGN, + TK_TEST_EQ, + TK_PLUS, + TK_PLUS_EQ, + TK_BSLASH, + TK_PIPE, + TK_LOG_PIPE, + TK_PIPE_EQ, + TK_DIV, + TK_DIV_EQ, + TK_MOD, + TK_MOD_EQ, + TK_LT, + TK_GT, + TK_LEQ, + TK_GEQ, + TK_SHR, + TK_SHR_EQ, + TK_SHL, + TK_SHL_EQ }; typedef unsigned long long int_lit_t; @@ -60,18 +60,23 @@ typedef double float_lit_t; struct token { enum token_type type; union { - char* identifier; + char* ident; int_lit_t int_lit; float_lit_t float_lit; char char_lit; char* str_lit; void* unused; } data; + + const char* PATH; + unsigned long LINE; + unsigned long COL; }; void lexer_load(const char* path); void lexer_close(); bool lexer_peek(struct token* p_token); bool lexer_pop(struct token* p_token); +bool lexer_eof(); #endif @@ -1,34 +1,30 @@ #include "lexer.h" +#include "parser.h" #include <stdlib.h> #include <stdio.h> -int main(int argc, char** argv) { - if (argc < 2) { - fprintf(stderr, "ccc: no input files\n"); - return 1; - } - +void test_lexer(int argc, char** argv) { struct token token; for (int i = 1; i < argc; i++) { lexer_load(argv[i]); while (lexer_pop(&token)) { printf("[%s]: ", argv[i]); switch (token.type) { - case IDENTIFIER: - printf("got identifier: %s\n", token.data.identifier); - free(token.data.identifier); + case TK_IDENT: + printf("got identifier: %s\n", token.data.ident); + free(token.data.ident); break; - case STR_LIT: + case TK_STR_LIT: printf("got string: %s\n", token.data.str_lit); free(token.data.str_lit); break; - case INT_LIT: + case TK_INT_LIT: printf("got int: %lld\n", token.data.int_lit); break; - case FLOAT_LIT: + case TK_FLOAT_LIT: printf("got float: %lf\n", token.data.float_lit); break; - case CHAR_LIT: + case TK_CHAR_LIT: printf("got char: %c\n", token.data.char_lit); break; default: @@ -38,3 +34,25 @@ int main(int argc, char** argv) { lexer_close(); } } + +void gdb_break_here() {} + +void test_parser(int argc, char** argv) { + struct root_node* root; + struct root_node** p_cur = &root; + for (int i = 1; i < argc; i++) { + *p_cur = parse(argv[i]); + p_cur = &((*p_cur)->next); + } + gdb_break_here(); + ast_destroy(root); +} + +int main(int argc, char** argv) { + if (argc < 2) { + fprintf(stderr, "ccc: no input files\n"); + return 1; + } + + test_parser(argc, argv); +} diff --git a/parser.c b/parser.c new file mode 100644 index 0000000..590ed2b --- /dev/null +++ b/parser.c @@ -0,0 +1,190 @@ +#include "parser.h" +#include "lexer.h" +#include <stdlib.h> +#include <stdio.h> +#include <string.h> + +#define PARSER_PANIC(format, ...) {\ + fprintf(\ + stderr,\ + "ccc: parse error: %s: line %lu, column %lu: " format "\n",\ + tok.PATH,\ + tok.LINE,\ + tok.COL __VA_OPT__(,)\ + __VA_ARGS__);\ + exit(1);\ +} + +static struct token tok; + +static void* protected_alloc(size_t sz) { + void* ptr = calloc(1, sz); + if (ptr == NULL) { + fprintf(stderr, "ccc: out of memory\n"); + exit(1); + } + return ptr; +} + +static void unexpected_token(enum token_type expected) { + /* TODO: print what token was expected */ + PARSER_PANIC("unexpected token"); +} + +static void expect(enum token_type expected) { + if (!lexer_pop(&tok)) + PARSER_PANIC("unexpected EOF"); + + if (tok.type != expected) unexpected_token(expected); +} + +/* "handle" indicates that we've peeked already */ +static void handle_expr(struct expr_node* p_node); +static void handle_stmt(struct stmt_node* p_node); + +static void parse_type(struct type_node* p_node) { + expect(TK_IDENT); + /* TODO: maybe check that this type is real haha */ + p_node->type = tok.data.ident; +} + +static void parse_var_decl(struct var_decl_node* p_node) { + parse_type(&p_node->type); + expect(TK_IDENT); + p_node->ident = tok.data.ident; +} + +static void parse_group(struct group_node* p_node) { + expect(TK_LCURLY); + + struct stmt_node** pp_node = &p_node->body_head; + for (;;) { + if (!lexer_peek(&tok)) + PARSER_PANIC("unexpected EOF in statement group"); + + if (tok.type == TK_RCURLY) break; + + *pp_node = protected_alloc(sizeof(struct stmt_node)); + handle_stmt(*pp_node); + pp_node = &((*pp_node)->next); + } + + expect(TK_RCURLY); +} + +static void handle_stmt(struct stmt_node* p_node) { + if (tok.type == TK_LCURLY) { + p_node->type = STMT_GROUP; + parse_group(&p_node->as._group); + } else { + p_node->type = STMT_EXPR; + handle_expr(&p_node->as._expr); + expect(TK_SEMI); + } +} + +static void parse_arg_list(struct var_decl_node** pp_arg) { + for (;;) { + *pp_arg = protected_alloc(sizeof(struct var_decl_node)); + parse_var_decl(*pp_arg); + pp_arg = &((*pp_arg)->next); + + if (!lexer_peek(&tok)) + PARSER_PANIC("unexpected EOF in argument list"); + + if (tok.type == TK_RPAREN) break; + expect(TK_COMMA); + } +} + +static void parse_fn_decl(struct fn_decl_node* p_node) { + parse_type(&p_node->return_type); + + expect(TK_IDENT); + p_node->name = tok.data.ident; + + expect(TK_LPAREN); + + if (!lexer_peek(&tok)) + PARSER_PANIC("unexpected EOF in function declaration"); + + + if (tok.type != TK_RPAREN) parse_arg_list(&p_node->args_head); + + expect(TK_RPAREN); + + parse_group(&p_node->body); +} + +static void parse_return(struct return_node* p_node) { + expect(TK_IDENT); + if (strcmp(tok.data.ident, "return") != 0) + PARSER_PANIC("unexpected token %s; expected: return", tok.data.ident); + + if (!lexer_peek(&tok)) + PARSER_PANIC("unexpected EOF in return statement"); + + if (tok.type == TK_SEMI) { + p_node->ret_val = NULL; + return; + } + + p_node->ret_val = protected_alloc(sizeof(struct expr_node)); + handle_expr(p_node->ret_val); +} + +static void parse_int_lit(struct int_lit_node* p_node) { + expect(TK_INT_LIT); + p_node->val = tok.data.int_lit; +} + +static void handle_expr(struct expr_node* p_node) { + switch (tok.type) { + case TK_SEMI: + p_node->type = EXPR_EMPTY; + return; + case TK_IDENT: + if (strcmp(tok.data.ident, "return") == 0) { + p_node->type = EXPR_RETURN; + parse_return(&p_node->as._return); + } else { + p_node->type = EXPR_VAR_DECL; + parse_var_decl(&p_node->as._var_decl); + } + return; + case TK_INT_LIT: + p_node->type = EXPR_INT_LIT; + parse_int_lit(&p_node->as._int_lit); + return; + default: + PARSER_PANIC("expected expression"); + } +} + +static bool parse_root(struct root_node* p_node) { + if (!lexer_peek(&tok)) return false; + + p_node->type = ROOT_FN_DECL; + parse_fn_decl(&p_node->as._fn_decl); + return true; +} + +struct root_node* parse(const char* path) { + lexer_load(path); + + struct root_node* root; + struct root_node** p_node = &root; + + for (;;) { + *p_node = protected_alloc(sizeof(struct root_node)); + if (!parse_root(*p_node)) { + free(*p_node); + *p_node = NULL; + break; + } + p_node = &((*p_node)->next); + } + + lexer_close(); + return root; +} diff --git a/parser.h b/parser.h new file mode 100644 index 0000000..5449b4f --- /dev/null +++ b/parser.h @@ -0,0 +1,8 @@ +#ifndef PARSER_H +#define PARSER_H + +#include "ast.h" + +struct root_node* parse(const char* path); + +#endif diff --git a/test/simple.c b/test/simple.c new file mode 100644 index 0000000..33c14ce --- /dev/null +++ b/test/simple.c @@ -0,0 +1,3 @@ +int main() { + return 0; +} |
