summaryrefslogtreecommitdiff
path: root/lexer.c
diff options
context:
space:
mode:
authorCarson Fleming <cflems@cflems.net>2026-03-15 19:25:26 -0400
committerCarson Fleming <cflems@cflems.net>2026-03-15 19:25:26 -0400
commit66e278b0752eaa9808687c0dc214b49d7b58e8eb (patch)
treecca09f74ab439c3d54913bc1b4355b8208a5824f /lexer.c
parent1f85b418dd7960c28f16de21c44dcb4e2e05e694 (diff)
downloadccc-66e278b0752eaa9808687c0dc214b49d7b58e8eb.tar.gz
functional lexer
Diffstat (limited to 'lexer.c')
-rw-r--r--lexer.c176
1 files changed, 138 insertions, 38 deletions
diff --git a/lexer.c b/lexer.c
index 7e2f5a4..a4ffd89 100644
--- a/lexer.c
+++ b/lexer.c
@@ -1,11 +1,23 @@
#include "ccc.h"
#include "lexer.h"
+#include <stdlib.h>
+#include <stdio.h>
#include <string.h>
#include <stdckdint.h>
/*
 * Report a fatal lexer error on stderr and terminate the process.
 *
 * The message is prefixed with the current LINE and COL values, followed by
 * the printf-style `format` and its optional arguments, then a newline.
 * The process exits with status 1, so the macro never returns.
 *
 * The body is wrapped in do { ... } while (0) so that the macro behaves as a
 * single statement: `if (x) LEXER_PANIC("..."); else ...` parses correctly,
 * which a bare `{ ... }` block followed by `;` would not.
 */
#define LEXER_PANIC(format, ...) do {\
    fprintf(\
        stderr,\
        "ccc: lexer error: line %lu, column %lu: " format "\n",\
        LINE,\
        COL __VA_OPT__(,)\
        __VA_ARGS__);\
    exit(1);\
} while (0)
+
/* Stream the lexer reads characters from; NULL while no file is open. */
static FILE* file = NULL;

/* The next unread character of `file` as returned by fgetc (may be EOF). */
static int lookahead;

/* Line number reported in lexer error messages; set to 1 on load. */
static unsigned long LINE;

/* Column number reported in lexer error messages; set to 1 on load. */
static unsigned long COL;
void lexer_load(const char* path) {
if (file != NULL) {
@@ -16,6 +28,13 @@ void lexer_load(const char* path) {
lookahead = fgetc(file);
LINE = 1;
+ COL = 1;
+}
+
+void lexer_close() {
+ if (file == NULL) return;
+ fclose(file);
+ file = NULL;
}
bool lexer_peek(struct token* p_token) {
@@ -40,6 +59,7 @@ bool lexer_peek(struct token* p_token) {
/*
 * Consume one character of input.
 *
 * Returns the character that was in `lookahead` (possibly EOF), refills
 * `lookahead` with the next character from `file`, and advances COL by one.
 * LINE is not touched here.
 */
static int consume_char() {
    const int consumed = lookahead;

    COL++;
    lookahead = fgetc(file);
    return consumed;
}
@@ -50,7 +70,8 @@ static void lex_ident(struct token* p_token, char ic) {
while (is_ident_legal(lookahead)) {
int c = consume_char();
if (len >= sizeof(buf) - 1)
- CCC_ERROR("identifier exceeds maximum size (%ld)", sizeof(buf));
+ LEXER_PANIC(
+ "identifier exceeds maximum size (%ld)", sizeof(buf) - 1);
buf[len++] = c;
}
@@ -66,10 +87,10 @@ static void lex_float_lit(
unsigned char base,
double iv
) {
- CCC_ERROR("lexer: floating point literals are not supported yet");
+ LEXER_PANIC("floating point literals are not implemented");
}
-static void lex_int_lit(struct token* p_token, intlit_t iv) {
+static void lex_int_lit(struct token* p_token, int_lit_t iv) {
unsigned char base = 10;
/* TODO: exponentiation, 2e10 f.e. */
@@ -79,31 +100,25 @@ static void lex_int_lit(struct token* p_token, intlit_t iv) {
base = (lookahead == 'x' || lookahead == 'X') ? 16 : 2;
int suffix = consume_char();
if (!is_alphanumeric(lookahead))
- CCC_ERROR(
- "lexer: invalid suffix on integer constant: %c", suffix);
+ LEXER_PANIC("invalid suffix on integer constant: %c", suffix);
} else base = 8;
}
while (is_alphanumeric(lookahead)) {
int c = consume_char();
- intlit_t c_val;
+ int_lit_t c_val;
if (is_numeric(c)) c_val = c - '0';
else if (is_lower_alpha(c)) c_val = c - 'a' + 10;
else c_val = c - 'A' + 10;
if (c_val >= base)
- CCC_ERROR(
- "lexer: invalid digit in base %hhu: %c",
- base,
- c);
+ LEXER_PANIC("invalid digit in base %hhu: %c", base, c);
if (ckd_mul(&iv, iv, base))
- CCC_ERROR(
- "lexer: integer literal will overflow");
+ LEXER_PANIC("integer literal will overflow");
if (ckd_add(&iv, iv, c_val))
- CCC_ERROR(
- "lexer: integer literal will overflow");
+ LEXER_PANIC("integer literal will overflow");
}
if (lookahead == '.') {
@@ -118,24 +133,31 @@ static void lex_int_lit(struct token* p_token, intlit_t iv) {
};
}
/*
 * Translate the character following a backslash into the character it denotes.
 *
 * c: the character read immediately after '\'.
 *
 * Returns the value of the simple escape sequence. All simple escapes defined
 * by the C standard are handled (\' \" \? \\ \a \b \f \n \r \t \v). Any other
 * character (including octal, hex and universal-character escapes) is a fatal
 * lexer error.
 */
static char replace_escape_sequence(char c) {
    switch (c) {
    case '\'': return '\'';
    case '"':  return '"';
    case '?':  return '\?';
    case '\\': return '\\';
    case 'a':  return '\a';
    case 'b':  return '\b';
    case 'f':  return '\f';
    case 'n':  return '\n';
    case 'r':  return '\r';
    case 't':  return '\t';
    case 'v':  return '\v';
    default:
        LEXER_PANIC("escape sequence not implemented");
    }

    /* Not reached: LEXER_PANIC exits. Present so that no path falls off the
     * end of a non-void function. */
    return '\0';
}
+
static void lex_char_lit(struct token* p_token) {
int c = consume_char();
- if (c == EOF) CCC_ERROR("lexer: unexpected EOF in char literal");
+ if (c == EOF) LEXER_PANIC("unexpected EOF in char literal");
if (c == '\\') {
c = consume_char();
- if (c == EOF) CCC_ERROR("lexer: unexpected EOF in char literal");
-
- if (c == '\'') c = '\'';
- else if (c == '\"') c = '\"';
- else CCC_ERROR(
- "lexer: escape sequences other than quotes are not supported yet");
+ if (c == EOF) LEXER_PANIC("unexpected EOF in char literal");
+ c = replace_escape_sequence(c);
}
int close_quote = consume_char();
- if (close_quote == EOF) CCC_ERROR("lexer: unexpected EOF in char literal");
+ if (close_quote == EOF) LEXER_PANIC("unexpected EOF in char literal");
if (close_quote != '\'')
- CCC_ERROR("lexer: expected \"'\", not \"%c\"", close_quote);
+ LEXER_PANIC(
+ "expected end of char literal, not \"%c\"", close_quote);
*p_token = (struct token) {
.type = CHAR_LIT,
@@ -144,17 +166,82 @@ static void lex_char_lit(struct token* p_token) {
}
static void lex_str_lit(struct token* p_token) {
- /* TODO: impl */
+ if (lookahead == '"') {
+ consume_char();
+ *p_token = (struct token) {
+ .type = STR_LIT,
+ .data.str_lit = strdup(""),
+ };
+ return;
+ }
+
+ char buf[65536];
+ unsigned int len = 0;
+ int c;
+ for (;;) {
+ c = consume_char();
+ if (c == '"') break;
+ if (c == EOF || c == '\n') LEXER_PANIC("unterminated string literal");
+
+ if (c == '\\') {
+ c = consume_char();
+ if (c == EOF) LEXER_PANIC("unterminated string literal");
+ c = replace_escape_sequence(c);
+ }
+
+ if (len >= sizeof(buf) - 1)
+ LEXER_PANIC(
+ "string literal exceeds maximum length (%ld)",
+ sizeof(buf) - 1);
+ buf[len++] = c;
+ }
+ buf[len] = 0;
+
+ *p_token = (struct token) {
+ .type = STR_LIT,
+ .data.str_lit = strndup(buf, sizeof(buf) - 1),
+ };
+}
+
+static enum token_type two_char_operator_type(char c) {
+ if (c == '!' && lookahead == '=') return NEQ;
+ if (c == '^' && lookahead == '=') return XEQ;
+ if (c == '&' && lookahead == '=') return AND_EQ;
+ if (c == '&' && lookahead == '&') return LOG_AND;
+ if (c == '*' && lookahead == '=') return MUL_EQ;
+ if (c == '-' && lookahead == '=') return NEG_EQ;
+ if (c == '-' && lookahead == '>') return ARROW;
+ if (c == '=' && lookahead == '=') return TEST_EQ;
+ if (c == '+' && lookahead == '=') return PLUS_EQ;
+ if (c == '|' && lookahead == '|') return LOG_PIPE;
+ if (c == '|' && lookahead == '=') return PIPE_EQ;
+ if (c == '/' && lookahead == '=') return DIV_EQ;
+ if (c == '%' && lookahead == '=') return MOD_EQ;
+ if (c == '<' && lookahead == '=') return LEQ;
+ if (c == '>' && lookahead == '=') return GEQ;
+ if (c == '<' && lookahead == '<') return SHL;
+ if (c == '>' && lookahead == '>') return SHR;
+ return NOT_FOUND;
}
-static bool lex_complex_operator(enum token_type* p_token_type, char c) {
- /* TODO: impl 2 char operators */
- return false;
+static bool lex_complex_operator(struct token* p_token, char c) {
+ enum token_type type = two_char_operator_type(c);
+ if (type == NOT_FOUND) return false;
+ consume_char();
+ if (type == SHL && lookahead == '=') {
+ consume_char();
+ type = SHL_EQ;
+ }
+ if (type == SHR && lookahead == '=') {
+ consume_char();
+ type = SHR_EQ;
+ }
+ *p_token = (struct token) {.type = type};
+ return type;
}
static enum token_type lex_simple_operator(char c) {
switch (c) {
- case '*': return STAR;
case '#': return HASHTAG;
case '(': return LPAREN;
case ')': return RPAREN;
@@ -167,22 +254,32 @@ static enum token_type lex_simple_operator(char c) {
case ',': return COMMA;
case '.': return DOT;
case '?': return QMARK;
- /* TODO: fill in */
+ case '!': return NOT;
+ case '^': return XOR;
+ case '&': return AMP;
+ case '*': return STAR;
+ case '-': return NEG;
+ case '=': return ASSIGN;
+ case '+': return PLUS;
+ case '\\': return BSLASH;
+ case '|': return PIPE;
+ case '/': return DIV;
+ case '%': return MOD;
+ case '<': return LT;
+ case '>': return GT;
}
- CCC_ERROR("lexer: unexpected token %c", c);
+ LEXER_PANIC("unexpected token %c", c);
}
bool lexer_pop(struct token* p_token) {
- /* TODO: e.g. float f = .25; */
if (file == NULL) return false;
// consume all whitespace and comments preceding the next token
int c;
for (;;) {
c = consume_char();
- // one of these
if (c == EOF) return false;
- else if (c == '/' && lookahead == '/') {
+ else if (c == '/' && lookahead == '/') { // one of these
while (lookahead != EOF && lookahead != '\n') consume_char();
}
else if (c == '/' && lookahead == '*') {
@@ -190,10 +287,13 @@ bool lexer_pop(struct token* p_token) {
int c = consume_char();
while (c != EOF && (c != '*' || lookahead != '/'))
c = consume_char();
- if (c == EOF) CCC_ERROR("unterminated /* comment");
- consume_char(); /* consume the final / */
+ if (c == EOF) LEXER_PANIC("unterminated /* comment");
+ consume_char(); /* consume the final slash */
+ }
+ else if (c == '\n') {
+ LINE++;
+ COL = 1;
}
- else if (c == '\n') LINE++;
else if (!is_whitespace(c)) break;
}
@@ -207,7 +307,7 @@ bool lexer_pop(struct token* p_token) {
lex_char_lit(p_token);
else if (c == '"')
lex_str_lit(p_token);
- else if (!lex_complex_operator(&p_token->type, c))
+ else if (!lex_complex_operator(p_token, c))
p_token->type = lex_simple_operator(c);
return true;