diff options
| -rw-r--r-- | .gitignore | 1 | ||||
| -rw-r--r-- | ccc.h | 7 | ||||
| -rw-r--r-- | lexer.c | 176 | ||||
| -rw-r--r-- | lexer.h | 9 | ||||
| -rw-r--r-- | main.c | 39 |
5 files changed, 186 insertions, 46 deletions
@@ -1,3 +1,4 @@ +ccc *.o *.out build/** @@ -1,12 +1,7 @@ #ifndef CCC_H #define CCC_H -#include <stdio.h> -#include <stdlib.h> #define CCC_PANIC { perror("ccc"); exit(1); } -#define CCC_ERROR(format, ...) {\ - fprintf(stderr, "line %ld: " format "\n", LINE __VA_OPT__(,) __VA_ARGS__);\ - exit(1);\ -} + #endif @@ -1,11 +1,23 @@ #include "ccc.h" #include "lexer.h" +#include <stdlib.h> +#include <stdio.h> #include <string.h> #include <stdckdint.h> +#define LEXER_PANIC(format, ...) {\ + fprintf(\ + stderr,\ + "ccc: lexer error: line %lu, column %lu: " format "\n",\ + LINE,\ + COL __VA_OPT__(,)\ + __VA_ARGS__);\ + exit(1);\ +} + static FILE* file = NULL; static int lookahead; -static long LINE; +static unsigned long LINE, COL; void lexer_load(const char* path) { if (file != NULL) { @@ -16,6 +28,13 @@ void lexer_load(const char* path) { lookahead = fgetc(file); LINE = 1; + COL = 1; +} + +void lexer_close() { + if (file == NULL) return; + fclose(file); + file = NULL; } bool lexer_peek(struct token* p_token) { @@ -40,6 +59,7 @@ bool lexer_peek(struct token* p_token) { static int consume_char() { int rv = lookahead; lookahead = fgetc(file); + COL++; return rv; } @@ -50,7 +70,8 @@ static void lex_ident(struct token* p_token, char ic) { while (is_ident_legal(lookahead)) { int c = consume_char(); if (len >= sizeof(buf) - 1) - CCC_ERROR("identifier exceeds maximum size (%ld)", sizeof(buf)); + LEXER_PANIC( + "identifier exceeds maximum size (%ld)", sizeof(buf) - 1); buf[len++] = c; } @@ -66,10 +87,10 @@ static void lex_float_lit( unsigned char base, double iv ) { - CCC_ERROR("lexer: floating point literals are not supported yet"); + LEXER_PANIC("floating point literals are not implemented"); } -static void lex_int_lit(struct token* p_token, intlit_t iv) { +static void lex_int_lit(struct token* p_token, int_lit_t iv) { unsigned char base = 10; /* TODO: exponentiation, 2e10 f.e. */ @@ -79,31 +100,25 @@ static void lex_int_lit(struct token* p_token, intlit_t iv) { base = (lookahead == 'x' || lookahead == 'X') ? 16 : 2; int suffix = consume_char(); if (!is_alphanumeric(lookahead)) - CCC_ERROR( - "lexer: invalid suffix on integer constant: %c", suffix); + LEXER_PANIC("invalid suffix on integer constant: %c", suffix); } else base = 8; } while (is_alphanumeric(lookahead)) { int c = consume_char(); - intlit_t c_val; + int_lit_t c_val; if (is_numeric(c)) c_val = c - '0'; else if (is_lower_alpha(c)) c_val = c - 'a' + 10; else c_val = c - 'A' + 10; if (c_val >= base) - CCC_ERROR( - "lexer: invalid digit in base %hhu: %c", - base, - c); + LEXER_PANIC("invalid digit in base %hhu: %c", base, c); if (ckd_mul(&iv, iv, base)) - CCC_ERROR( - "lexer: integer literal will overflow"); + LEXER_PANIC("integer literal will overflow"); if (ckd_add(&iv, iv, c_val)) - CCC_ERROR( - "lexer: integer literal will overflow"); + LEXER_PANIC("integer literal will overflow"); } if (lookahead == '.') { @@ -118,24 +133,31 @@ static void lex_int_lit(struct token* p_token, intlit_t iv) { }; } +static char replace_escape_sequence(char c) { + if (c == '\'') return '\''; + else if (c == '\"') return '\"'; + else if (c == '\\') return '\\'; + else if (c == 'r') return '\r'; + else if (c == 'n') return '\n'; + else if (c == 't') return '\t'; + else LEXER_PANIC("escape sequence not implemented"); +} + static void lex_char_lit(struct token* p_token) { int c = consume_char(); - if (c == EOF) CCC_ERROR("lexer: unexpected EOF in char literal"); + if (c == EOF) LEXER_PANIC("unexpected EOF in char literal"); if (c == '\\') { c = consume_char(); - if (c == EOF) CCC_ERROR("lexer: unexpected EOF in char literal"); - - if (c == '\'') c = '\''; - else if (c == '\"') c = '\"'; - else CCC_ERROR( - "lexer: escape sequences other than quotes are not supported yet"); + if (c == EOF) LEXER_PANIC("unexpected EOF in char literal"); + c = replace_escape_sequence(c); } int close_quote = consume_char(); - if (close_quote == EOF) CCC_ERROR("lexer: unexpected EOF in char literal"); + if (close_quote == EOF) LEXER_PANIC("unexpected EOF in char literal"); if (close_quote != '\'') - CCC_ERROR("lexer: expected \"'\", not \"%c\"", close_quote); + LEXER_PANIC( + "expected end of char literal, not \"%c\"", close_quote); *p_token = (struct token) { .type = CHAR_LIT, @@ -144,17 +166,82 @@ static void lex_char_lit(struct token* p_token) { } static void lex_str_lit(struct token* p_token) { - /* TODO: impl */ + if (lookahead == '"') { + consume_char(); + *p_token = (struct token) { + .type = STR_LIT, + .data.str_lit = strdup(""), + }; + return; + } + + char buf[65536]; + unsigned int len = 0; + int c; + for (;;) { + c = consume_char(); + if (c == '"') break; + if (c == EOF || c == '\n') LEXER_PANIC("unterminated string literal"); + + if (c == '\\') { + c = consume_char(); + if (c == EOF) LEXER_PANIC("unterminated string literal"); + c = replace_escape_sequence(c); + } + + if (len >= sizeof(buf) - 1) + LEXER_PANIC( + "string literal exceeds maximum length (%ld)", + sizeof(buf) - 1); + buf[len++] = c; + } + buf[len] = 0; + + *p_token = (struct token) { + .type = STR_LIT, + .data.str_lit = strndup(buf, sizeof(buf) - 1), + }; +} + +static enum token_type two_char_operator_type(char c) { + if (c == '!' && lookahead == '=') return NEQ; + if (c == '^' && lookahead == '=') return XEQ; + if (c == '&' && lookahead == '=') return AND_EQ; + if (c == '&' && lookahead == '&') return LOG_AND; + if (c == '*' && lookahead == '=') return MUL_EQ; + if (c == '-' && lookahead == '=') return NEG_EQ; + if (c == '-' && lookahead == '>') return ARROW; + if (c == '=' && lookahead == '=') return TEST_EQ; + if (c == '+' && lookahead == '=') return PLUS_EQ; + if (c == '|' && lookahead == '|') return LOG_PIPE; + if (c == '|' && lookahead == '=') return PIPE_EQ; + if (c == '/' && lookahead == '=') return DIV_EQ; + if (c == '%' && lookahead == '=') return MOD_EQ; + if (c == '<' && lookahead == '=') return LEQ; + if (c == '>' && lookahead == '=') return GEQ; + if (c == '<' && lookahead == '<') return SHL; + if (c == '>' && lookahead == '>') return SHR; + return NOT_FOUND; } -static bool lex_complex_operator(enum token_type* p_token_type, char c) { - /* TODO: impl 2 char operators */ - return false; +static bool lex_complex_operator(struct token* p_token, char c) { + enum token_type type = two_char_operator_type(c); + if (type == NOT_FOUND) return false; + consume_char(); + if (type == SHL && lookahead == '=') { + consume_char(); + type = SHL_EQ; + } + if (type == SHR && lookahead == '=') { + consume_char(); + type = SHR_EQ; + } + *p_token = (struct token) {.type = type}; + return type; } static enum token_type lex_simple_operator(char c) { switch (c) { - case '*': return STAR; case '#': return HASHTAG; case '(': return LPAREN; case ')': return RPAREN; @@ -167,22 +254,32 @@ static enum token_type lex_simple_operator(char c) { case ',': return COMMA; case '.': return DOT; case '?': return QMARK; - /* TODO: fill in */ + case '!': return NOT; + case '^': return XOR; + case '&': return AMP; + case '*': return STAR; + case '-': return NEG; + case '=': return ASSIGN; + case '+': return PLUS; + case '\\': return BSLASH; + case '|': return PIPE; + case '/': return DIV; + case '%': return MOD; + case '<': return LT; + case '>': return GT; } - CCC_ERROR("lexer: unexpected token %c", c); + LEXER_PANIC("unexpected token %c", c); } bool lexer_pop(struct token* p_token) { - /* TODO: e.g. float f = .25; */ if (file == NULL) return false; // consume all whitespace and comments preceding the next token int c; for (;;) { c = consume_char(); - // one of these if (c == EOF) return false; - else if (c == '/' && lookahead == '/') { + else if (c == '/' && lookahead == '/') { // one of these while (lookahead != EOF && lookahead != '\n') consume_char(); } else if (c == '/' && lookahead == '*') { @@ -190,10 +287,13 @@ bool lexer_pop(struct token* p_token) { int c = consume_char(); while (c != EOF && (c != '*' || lookahead != '/')) c = consume_char(); - if (c == EOF) CCC_ERROR("unterminated /* comment"); - consume_char(); /* consume the final / */ + if (c == EOF) LEXER_PANIC("unterminated /* comment"); + consume_char(); /* consume the final slash */ + } + else if (c == '\n') { + LINE++; + COL = 1; } - else if (c == '\n') LINE++; else if (!is_whitespace(c)) break; } @@ -207,7 +307,7 @@ bool lexer_pop(struct token* p_token) { lex_char_lit(p_token); else if (c == '"') lex_str_lit(p_token); - else if (!lex_complex_operator(&p_token->type, c)) + else if (!lex_complex_operator(p_token, c)) p_token->type = lex_simple_operator(c); return true; @@ -2,8 +2,10 @@ #define LEXER_H enum token_type { + NOT_FOUND, IDENTIFIER, INT_LIT, + FLOAT_LIT, // TODO CHAR_LIT, STR_LIT, HASHTAG, @@ -52,13 +54,15 @@ enum token_type { SHL_EQ }; -typedef unsigned long long intlit_t; +typedef unsigned long long int_lit_t; +typedef double float_lit_t; struct token { enum token_type type; union { char* identifier; - intlit_t int_lit; + int_lit_t int_lit; + float_lit_t float_lit; char char_lit; char* str_lit; void* unused; @@ -66,6 +70,7 @@ struct token { }; void lexer_load(const char* path); +void lexer_close(); bool lexer_peek(struct token* p_token); bool lexer_pop(struct token* p_token); @@ -0,0 +1,39 @@ +#include "lexer.h" +#include <stdlib.h> +#include <stdio.h> + +int main(int argc, char** argv) { + if (argc < 2) { + fprintf(stderr, "ccc: no input files"); + return 1; + } + + struct token token; + for (int i = 1; i < argc; i++) { + lexer_load(argv[i]); + while (lexer_pop(&token)) { + switch (token.type) { + case IDENTIFIER: + printf("got identifier: %s\n", token.data.identifier); + free(token.data.identifier); + break; + case STR_LIT: + printf("got string: %s\n", token.data.str_lit); + free(token.data.str_lit); + break; + case INT_LIT: + printf("got int: %lld\n", token.data.int_lit); + break; + case FLOAT_LIT: + printf("got float: %lf\n", token.data.float_lit); + break; + case CHAR_LIT: + printf("got char: %c\n", token.data.char_lit); + break; + default: + printf("got simple token: %d\n", token.type); + } + } + lexer_close(); + } +} |
