Initial commit, basic lexer structure
This commit is contained in:
commit
df948b18c6
5
.clang-format
Normal file
5
.clang-format
Normal file
@ -0,0 +1,5 @@
|
||||
BasedOnStyle: LLVM
|
||||
IndentWidth: 4
|
||||
Cpp11BracedListStyle: true
|
||||
AlignArrayOfStructures: Left
|
||||
AllowShortFunctionsOnASingleLine: Empty
|
7
.gitignore
vendored
Normal file
7
.gitignore
vendored
Normal file
@ -0,0 +1,7 @@
|
||||
*.o
|
||||
*.d
|
||||
/core
|
||||
/oas
|
||||
/oas-asan
|
||||
/oas-msan
|
||||
/reports
|
19
LICENSE
Normal file
19
LICENSE
Normal file
@ -0,0 +1,19 @@
|
||||
Copyright (c) 2025 omicron <omicron.me@protonmail.com>
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
this software and associated documentation files (the "Software"), to deal in
|
||||
the Software without restriction, including without limitation the rights to
|
||||
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
of the Software, and to permit persons to whom the Software is furnished to do
|
||||
so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
42
Makefile
Normal file
42
Makefile
Normal file
@ -0,0 +1,42 @@
|
||||
# Toolchain and flags. TARGET and LDFLAGS are ?= so the sanitize target can
# override them per-build.
.PHONY: all clean clean-objects run sanitize validate

CC=clang
LD=clang
CFLAGS=-Wall -Wextra -Wpedantic -O0 -g3 -std=c23 -fno-omit-frame-pointer -fno-optimize-sibling-calls
LDFLAGS?=

# Every .c under src/ is built; .d files carry auto-generated header deps.
SOURCES = $(shell find src/ -type f -name '*.c')
OBJECTS = $(SOURCES:.c=.o)
DEPENDENCIES = $(SOURCES:.c=.d)
TARGET?=oas
OUTPUTS=oas oas-asan oas-msan
RUNARGUMENTS=-tokens test.asm

all: $(TARGET)


run: $(TARGET)
	./$(TARGET) $(RUNARGUMENTS)

# Build ASan/UBSan and MSan instrumented binaries, cleaning objects between
# builds so every object is compiled with the matching sanitizer flags.
sanitize:
	make CFLAGS="$(CFLAGS) -fsanitize=address,undefined" LDFLAGS="-fsanitize=address,undefined" TARGET="oas-asan" clean-objects all
	make CFLAGS="$(CFLAGS) -fsanitize=memory -fsanitize-memory-track-origins=2" LDFLAGS="-fsanitize=memory -fsanitize-memory-track-origins=2" TARGET="oas-msan" clean-objects all
	make clean-objects

validate:
	./validate.sh

$(TARGET): $(OBJECTS)
	$(LD) $(LDFLAGS) -o $@ $^

# -MMD -MP writes a .d dependency file next to each object.
%.o: %.c
	$(CC) $(CFLAGS) -MMD -MP -c $< -o $@

-include $(DEPENDENCIES)

clean-objects:
	rm -f $(OBJECTS) $(DEPENDENCIES)

clean: clean-objects
	rm -f $(TARGET) $(OUTPUTS)
	rm -rf reports/
|
46
doc/lexer_grammar.txt
Normal file
46
doc/lexer_grammar.txt
Normal file
@ -0,0 +1,46 @@
|
||||
/* These non-terminals are the actual tokens the lexer emits */
|
||||
<identifier> ::= <identifier_start> <identifier_character>+
|
||||
<decimal> ::= [0-9]+
|
||||
|
||||
<hexadecimal> ::= "0x" <hex_digit>+ <number_suffix>?
|
||||
<binary> ::= "0b" [0-1]+ <number_suffix>?
|
||||
<octal> ::= "0o" [0-7]+ <number_suffix>?
|
||||
<string> ::= "\"" <string_unit>+ "\""
|
||||
<character> ::= "'" <character_unit> "'"
|
||||
<colon> ::= ":"
|
||||
<comma> ::= ","
|
||||
<lbracket> ::= "["
|
||||
<rbracket> ::= "]"
|
||||
<plus> ::= "+"
|
||||
<minus> ::= "-"
|
||||
<asterisk> ::= "*"
|
||||
<dot> ::= "."
|
||||
<comment> ::= ";" <comment_character>*
|
||||
<newline> ::= "\r"? "\n"
|
||||
<whitespace> ::= ( " " | "\t" )+
|
||||
|
||||
/* helper non-terminals to make it easier to define the tokens */
|
||||
<number_suffix> ::= ":" ( "8" | "16" | "32" | "64" )
|
||||
|
||||
<identifier_start> ::= [a-z] | [A-Z] | "_"
|
||||
<identifier_character> ::= [a-z] | [A-Z] | [0-9] | "_"
|
||||
|
||||
<hex_digit> ::= [a-f] | [A-F]
|
||||
|
||||
<string_unit> ::= <string_regular> | <escaped>
|
||||
<character_unit> ::= <character_regular> | <escaped>
|
||||
|
||||
<escaped> ::= "\\" ( <escape_list> | <escape_hex> )
|
||||
<escape_list> ::= "\\" | "n" | "r" | "t" | "0" | "\"" | "'"
|
||||
<escape_hex> ::= "x" <hex_digit> <hex_digit>
|
||||
|
||||
/* alternative definitions to support bnfplayground, use the ones below instead */
|
||||
<comment_character> ::= <shared_regular> | "'" | "\""
|
||||
<string_regular> ::= <shared_regular> | "'"
|
||||
<character_regular> ::= <shared_regular> | "\""
|
||||
<shared_regular> ::= [a-z] | [A-Z] | [0-9] | " " | "+" | "-" | "#" | "\t" | "_" | "$" | "&" | "{" | "}" | "(" | ")" | "|"
|
||||
|
||||
/* actual definition we're implementing */
|
||||
/* <comment_character> ::= [^\r\n] */
|
||||
/* <character_regular> ::= [^\\'] */
|
||||
/* <string_regular> ::= [^\\"] */
|
42
src/error.c
Normal file
42
src/error.c
Normal file
@ -0,0 +1,42 @@
|
||||
#include "error.h"
|
||||
|
||||
#include <stdarg.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// Static fallback errors returned when errorf() itself cannot allocate or
// measure a formatted message. is_heap_allocated stays false (zero
// initialized), so error_free() correctly ignores them.
error_t *const err_errorf_alloc = &(error_t){
    .message = "Allocation failed during formatting of another error"};
error_t *const err_errorf_length = &(error_t){
    .message =
        "Formatting of another error failed to determine the error length"};
|
||||
|
||||
error_t *errorf(const char *fmt, ...) {
|
||||
error_t *err = calloc(1, sizeof(error_t));
|
||||
if (err == nullptr)
|
||||
return err_errorf_alloc;
|
||||
|
||||
va_list args;
|
||||
va_list args_count;
|
||||
va_start(args, fmt);
|
||||
va_copy(args_count, args);
|
||||
|
||||
int size = vsnprintf(nullptr, 0, fmt, args_count) + 1;
|
||||
va_end(args_count);
|
||||
if (size <= 0) {
|
||||
free(err);
|
||||
va_end(args);
|
||||
return err_errorf_length;
|
||||
}
|
||||
|
||||
err->message = malloc(size);
|
||||
if (err->message == nullptr) {
|
||||
free(err);
|
||||
va_end(args);
|
||||
return err_errorf_alloc;
|
||||
}
|
||||
|
||||
vsnprintf(err->message, size, fmt, args);
|
||||
va_end(args);
|
||||
err->is_heap_allocated = true;
|
||||
return err;
|
||||
}
|
21
src/error.h
Normal file
21
src/error.h
Normal file
@ -0,0 +1,21 @@
|
||||
#ifndef INCLUDE_SRC_ERROR_H_
#define INCLUDE_SRC_ERROR_H_

#include <stdlib.h>

// A diagnostic message. Errors are either heap-allocated by errorf() or
// static singletons; is_heap_allocated tells error_free() which case it has.
typedef struct error {
    char *message;          // human-readable, NUL-terminated description
    bool is_heap_allocated; // true only for errors produced by errorf()
} error_t;

// Create a heap-allocated error from a printf-style format string.
// On internal failure a static fallback error is returned instead of nullptr.
error_t *errorf(const char *fmt, ...);

// Release an error. Accepts nullptr and static (non-heap) errors and
// ignores them, so callers may free unconditionally.
static inline void error_free(error_t *err) {
    if (err == nullptr)
        return;
    if (!err->is_heap_allocated)
        return;
    free(err->message);
    free(err);
}

#endif // INCLUDE_SRC_ERROR_H_
|
465
src/lexer.c
Normal file
465
src/lexer.c
Normal file
@ -0,0 +1,465 @@
|
||||
#include "lexer.h"
|
||||
#include "error.h"
|
||||
#include <assert.h>
|
||||
#include <ctype.h>
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
|
||||
// Static error singletons. Callers compare these by pointer identity
// (e.g. `err == err_eof`); none of them may ever be freed.
error_t *err_lexer_already_open = &(error_t){
    .message =
        "Can't open on a lexer object that is already opened. Close it first."};
error_t *err_prefix_too_large =
    &(error_t){.message = "Prefix too large for internal lexer buffer"};
error_t *err_buffer_underrun = &(error_t){
    .message = "Buffer does not contain enough characters for lexer_consume_n"};
error_t *err_consume_excessive_length =
    &(error_t){.message = "Too many valid characters to consume"};

// Not a failure in the usual sense: signals normal end of input.
error_t *err_eof =
    &(error_t){.message = "Can't read from file because EOF is reached"};

error_t *err_unknown_read = &(error_t){.message = "Unknown read error"};

error_t *err_allocation_failed =
    &(error_t){.message = "Memory allocation failed"};

// Predicate deciding whether a character belongs to the token being lexed.
typedef bool (*char_predicate_t)(char);
|
||||
|
||||
// Map a token id to its symbolic name for diagnostics.
const char *lexer_token_id_to_cstr(lexer_token_id_t id) {
    // Designated-initializer table; keep in sync with lexer_token_id_t.
    static const char *const names[] = {
        [TOKEN_ERROR] = "TOKEN_ERROR",
        [TOKEN_IDENTIFIER] = "TOKEN_IDENTIFIER",
        [TOKEN_DECIMAL] = "TOKEN_DECIMAL",
        [TOKEN_HEXADECIMAL] = "TOKEN_HEXADECIMAL",
        [TOKEN_OCTAL] = "TOKEN_OCTAL",
        [TOKEN_BINARY] = "TOKEN_BINARY",
        [TOKEN_CHAR] = "TOKEN_CHAR",
        [TOKEN_STRING] = "TOKEN_STRING",
        [TOKEN_COLON] = "TOKEN_COLON",
        [TOKEN_COMMA] = "TOKEN_COMMA",
        [TOKEN_LBRACKET] = "TOKEN_LBRACKET",
        [TOKEN_RBRACKET] = "TOKEN_RBRACKET",
        [TOKEN_PLUS] = "TOKEN_PLUS",
        [TOKEN_MINUS] = "TOKEN_MINUS",
        [TOKEN_ASTERISK] = "TOKEN_ASTERISK",
        [TOKEN_DOT] = "TOKEN_DOT",
        [TOKEN_COMMENT] = "TOKEN_COMMENT",
        [TOKEN_NEWLINE] = "TOKEN_NEWLINE",
        [TOKEN_WHITESPACE] = "TOKEN_WHITESPACE",
    };
    const size_t count = sizeof(names) / sizeof(names[0]);
    if ((size_t)id < count)
        return names[id];
    assert(!"Unreachable, weird token id" && id);
    __builtin_unreachable();
}
|
||||
|
||||
// Print a one-line summary of a token; error tokens get a second line
// carrying their explanation.
void lexer_token_print(lexer_token_t *token) {
    const char *name = lexer_token_id_to_cstr(token->id);
    const char *sep = token->value ? ": " : "";
    const char *value = token->value ? token->value : "";
    printf("(%zu, %zu) %s[%d]%s%s\n", token->line_number,
           token->character_number, name, token->id, sep, value);
    if (token->id == TOKEN_ERROR)
        printf(" `--> %s\n", token->explanation);
}
|
||||
|
||||
// Free the token's owned value string and reset all fields to zero so the
// token can be reused safely.
void lexer_token_cleanup(lexer_token_t *token) {
    free(token->value);
    memset(token, 0, sizeof(*token));
}
|
||||
|
||||
// Close the underlying file (if any) and zero the lexer so it can be
// reopened. The nullptr guard matters: fclose(NULL) is undefined behavior,
// and lexer_close() may be called on a lexer that was never opened or
// whose lexer_open() failed.
void lexer_close(lexer_t *lex) {
    if (lex->fp != nullptr)
        fclose(lex->fp);
    memset(lex, 0, sizeof(lexer_t));
}
|
||||
|
||||
// Top up the internal read-ahead buffer from the file until it is full or
// EOF. Returns err_eof only when EOF is reached AND the buffer is already
// empty, so callers can still drain buffered characters after the file ends.
error_t *lexer_fill_buffer(lexer_t *lex) {
    if (feof(lex->fp) && lex->buffer_count == 0)
        return err_eof;
    if (feof(lex->fp))
        return nullptr;
    if (lex->buffer_count == lexer_buffer_size)
        return nullptr;

    size_t remaining = lexer_buffer_size - lex->buffer_count;
    while (remaining > 0) {
        char *buffer = lex->buffer + lex->buffer_count;
        size_t n = fread(buffer, 1, remaining, lex->fp);
        // Partial fill at EOF is fine; a later call reports err_eof once
        // the buffered characters are consumed.
        if (n == 0 && feof(lex->fp))
            break;
        if (n == 0 && ferror(lex->fp))
            return errorf("Read error: %s", strerror(errno));
        if (n == 0)
            return err_unknown_read;
        remaining -= n;
        lex->buffer_count += n;
    }
    return nullptr;
}
|
||||
|
||||
// Attach the lexer to a file opened in binary mode. Fails when the lexer is
// already open or the file cannot be opened. Line/column counters are
// 0-based and reset here.
error_t *lexer_open(lexer_t *lex, char *path) {
    if (lex->fp != nullptr)
        return err_lexer_already_open;

    FILE *fp = fopen(path, "rb");
    if (fp == nullptr)
        return errorf("Failed to open file '%s': %s", path, strerror(errno));

    lex->fp = fp;
    lex->line_number = 0;
    lex->character_number = 0;
    lex->buffer_count = 0;
    return nullptr;
}
|
||||
|
||||
// Drop the first n characters from the read-ahead buffer, sliding the
// remainder to the front. The caller guarantees n <= buffer_count.
void lexer_shift_buffer(lexer_t *lex, int n) {
    size_t remaining = lex->buffer_count - (size_t)n;
    memmove(lex->buffer, lex->buffer + n, remaining);
    lex->buffer_count = remaining;
}
|
||||
|
||||
// Return the next character WITHOUT consuming it. The previous
// implementation shifted the buffer after reading, which consumed the
// character and made "peek" behave like "read next"; a peek must leave the
// stream untouched so a subsequent read sees the same character.
error_t *lexer_peek(lexer_t *lex, char *c) {
    error_t *err = lexer_fill_buffer(lex);
    if (err)
        return err;
    if (lex->buffer_count == 0)
        return err_eof;
    *c = lex->buffer[0];
    return nullptr;
}
|
||||
|
||||
// This does _not_ fill the internal lexer buffer and you _must_ call
|
||||
// lexer_fill_buffer() before calling this. It will always return false if your
|
||||
// prefix is larger than lexer_buffer_size
|
||||
bool lexer_has_prefix(lexer_t *lex, char *prefix) {
|
||||
size_t len = strlen(prefix);
|
||||
if (len > lex->buffer_count)
|
||||
return false;
|
||||
return memcmp(lex->buffer, prefix, len) == 0;
|
||||
}
|
||||
|
||||
// Placeholder used for token kinds that are not lexed yet; always returns
// a formatted error describing the offending character and its position.
error_t *lexer_not_implemented(lexer_t *lex, lexer_token_t *token) {
    (void)token;
    return errorf("Not implemented, character %02x (%c) at (%zu, %zu).\n",
                  lex->buffer[0], lex->buffer[0], lex->line_number,
                  lex->character_number);
}
|
||||
|
||||
// Consume exactly n characters from the lexer into `buffer` (capacity len).
// Fails when fewer than n characters are buffered, or when n exceeds the
// destination capacity. The original test was inverted (`len > n`), which
// rejected nearly every valid call — any destination larger than the
// request, such as the number-suffix consumption in lexer_next_number,
// spuriously returned err_consume_excessive_length.
error_t *lexer_consume_n(lexer_t *lex, const size_t len,
                         char buffer[static len], const size_t n) {
    if (lex->buffer_count < n)
        return err_buffer_underrun;
    if (n > len)
        return err_consume_excessive_length;

    memcpy(buffer, lex->buffer, n);
    lexer_shift_buffer(lex, n);
    return nullptr;
}
|
||||
// Consume characters satisfying is_valid into `buffer` (capacity n),
// refilling the lexer's read-ahead buffer as the run crosses its edge.
// *n_consumed receives the number of characters copied. EOF ends the run
// normally (not an error); err_consume_excessive_length is returned when
// the destination is full while more valid input remains.
error_t *lexer_consume(lexer_t *lex, const size_t n, char buffer[static n],
                       char_predicate_t is_valid, size_t *n_consumed) {
    const size_t buffer_size = n;
    bool have_more_characters = false;
    *n_consumed = 0;
    do {
        // Copy the longest valid run currently in the read-ahead buffer.
        size_t i = 0;
        while (i < lex->buffer_count && i < buffer_size - *n_consumed &&
               is_valid(lex->buffer[i])) {
            ++i;
        }
        memcpy(buffer + *n_consumed, lex->buffer, i);
        lexer_shift_buffer(lex, i);
        *n_consumed += i;

        // Refill to see whether the run continues past the buffer edge.
        error_t *err = lexer_fill_buffer(lex);
        if (err == err_eof)
            have_more_characters = false;
        else if (err)
            return err;
        else
            have_more_characters =
                (lex->buffer_count > 0 && is_valid(lex->buffer[0]));

        // Destination full but valid input still pending: overflow.
        if (have_more_characters && *n_consumed == buffer_size) {
            return err_consume_excessive_length;
        }
    } while (have_more_characters);
    return nullptr;
}
|
||||
|
||||
// True for [0-9a-fA-F]. The unsigned char cast avoids undefined behavior:
// <ctype.h> functions require an argument representable as unsigned char
// or EOF, and a plain char may be negative (CERT STR37-C).
bool is_hexadecimal_character(char c) {
    unsigned char uc = (unsigned char)c;
    return isdigit(uc) || (uc >= 'a' && uc <= 'f') || (uc >= 'A' && uc <= 'F');
}
|
||||
|
||||
// True for octal digits [0-7].
bool is_octal_character(char c) {
    return '0' <= c && c <= '7';
}
|
||||
|
||||
// True for binary digits '0' and '1'.
bool is_binary_character(char c) {
    switch (c) {
    case '0':
    case '1':
        return true;
    default:
        return false;
    }
}
|
||||
|
||||
// True for decimal digits. Cast to unsigned char before isdigit(): passing
// a negative plain char to <ctype.h> is undefined behavior (CERT STR37-C).
bool is_decimal_character(char c) {
    return isdigit((unsigned char)c);
}
|
||||
|
||||
// Lex a numeric literal: plain decimal, or 0x/0o/0b prefixed, followed by
// an optional :8/:16/:32/:64 width suffix. Malformed or overlong numbers
// become TOKEN_ERROR tokens (with an explanation); the error_t return is
// reserved for I/O failures.
error_t *lexer_next_number(lexer_t *lex, lexer_token_t *token) {
    constexpr size_t max_number_length = 128;
    size_t so_far = 0;
    size_t n = 0;
    char buffer[max_number_length + 1] = {};

    token->line_number = lex->line_number;
    token->character_number = lex->character_number;

    // Pick the digit predicate based on the radix prefix.
    char_predicate_t is_valid;
    if (lexer_has_prefix(lex, "0x")) {
        is_valid = is_hexadecimal_character;
        token->id = TOKEN_HEXADECIMAL;
        strcpy(buffer, "0x");
        so_far = 2;
    } else if (lexer_has_prefix(lex, "0o")) {
        is_valid = is_octal_character;
        token->id = TOKEN_OCTAL;
        strcpy(buffer, "0o");
        so_far = 2;
    } else if (lexer_has_prefix(lex, "0b")) {
        is_valid = is_binary_character;
        token->id = TOKEN_BINARY;
        strcpy(buffer, "0b");
        so_far = 2;
    } else {
        is_valid = is_decimal_character;
        token->id = TOKEN_DECIMAL;
    }
    if (so_far > 0) {
        lex->character_number += so_far;
        lexer_shift_buffer(lex, so_far);
    }

    // Digits after the prefix.
    error_t *err = lexer_consume(lex, max_number_length - so_far,
                                 buffer + so_far, is_valid, &n);
    if (err == err_consume_excessive_length) {
        token->id = TOKEN_ERROR;
        token->explanation =
            "Number length exceeds the maximum of 128 characters";
    }
    so_far += n;
    lex->character_number += n;
    if (n == 0) {
        // A bare prefix like "0x" with no digits is not a valid number.
        token->id = TOKEN_ERROR;
        token->explanation = "Invalid number format";
    }

    // Refill so the suffix lookahead below sees enough characters.
    err = lexer_fill_buffer(lex);
    if (err != err_eof && err) {
        return err;
    }

    // Optional width suffix.
    size_t suffix_length = 0;
    if (lexer_has_prefix(lex, ":8")) {
        suffix_length = 2;
    } else if (lexer_has_prefix(lex, ":16") || lexer_has_prefix(lex, ":32") ||
               lexer_has_prefix(lex, ":64")) {
        suffix_length = 3;
    }

    if (suffix_length > 0) {
        err = lexer_consume_n(lex, max_number_length - so_far, buffer + so_far,
                              suffix_length);
        if (err == err_consume_excessive_length) {
            token->id = TOKEN_ERROR;
            token->explanation =
                "Number length exceeds the maximum of 128 characters";
        } else if (err == nullptr) {
            // BUG FIX: the suffix characters were consumed but never added
            // to the column counter, desynchronizing subsequent token
            // positions on any line containing a width suffix.
            so_far += suffix_length;
            lex->character_number += suffix_length;
        }
    }

    token->value = strdup(buffer);
    return nullptr;
}
|
||||
// Lex a newline token: "\r\n" or "\n". A '\r' not followed by '\n' yields a
// TOKEN_ERROR. Two fixes versus the original: (1) the single-character
// token value is built with an explicit NUL terminator — strdup on a 1-byte
// compound literal read past the array; (2) the offending character is
// consumed, otherwise the caller would re-lex the same '\r' forever.
error_t *lexer_next_newline(lexer_t *lex, lexer_token_t *token) {
    token->line_number = lex->line_number;
    token->character_number = lex->character_number;
    token->id = TOKEN_NEWLINE;

    if (lexer_has_prefix(lex, "\r\n")) {
        lexer_shift_buffer(lex, 2);
        token->value = strdup("\r\n");
        lex->character_number = 0;
        lex->line_number += 1;
    } else if (lexer_has_prefix(lex, "\n")) {
        lexer_shift_buffer(lex, 1);
        token->value = strdup("\n");
        lex->character_number = 0;
        lex->line_number += 1;
    } else {
        token->id = TOKEN_ERROR;
        token->explanation = "Invalid newline format";
        char bad = lex->buffer[0];
        lexer_shift_buffer(lex, 1);
        lex->character_number += 1;
        token->value = strdup((char[]){bad, 0});
    }
    return nullptr;
}
|
||||
|
||||
// True for identifier characters [A-Za-z0-9_]. Cast to unsigned char
// before isalnum(): passing a negative plain char to <ctype.h> is
// undefined behavior (CERT STR37-C).
bool is_identifier_character(char c) {
    return isalnum((unsigned char)c) || c == '_';
}
|
||||
|
||||
// Lex an identifier: a run of [A-Za-z0-9_] (the dispatcher guarantees the
// first character is not a digit). Overlong identifiers become TOKEN_ERROR.
error_t *lexer_next_identifier(lexer_t *lex, lexer_token_t *token) {
    constexpr size_t max_identifier_length = 128;
    char scratch[max_identifier_length + 1] = {};
    size_t consumed = 0;

    token->id = TOKEN_IDENTIFIER;
    token->line_number = lex->line_number;
    token->character_number = lex->character_number;

    error_t *err = lexer_consume(lex, max_identifier_length, scratch,
                                 is_identifier_character, &consumed);
    if (err == err_consume_excessive_length) {
        token->id = TOKEN_ERROR;
        token->explanation =
            "Identifier length exceeds the maximum of 128 characters";
    }

    lex->character_number += consumed;
    token->value = strdup(scratch);
    return nullptr;
}
|
||||
|
||||
// TODO: character literals ('x', escapes) are not lexed yet; this stub
// always reports a "not implemented" error.
error_t *lexer_next_character(lexer_t *lex, lexer_token_t *token) {
    return lexer_not_implemented(lex, token);
}
|
||||
// TODO: string literals are not lexed yet; this stub always reports a
// "not implemented" error.
error_t *lexer_next_string(lexer_t *lex, lexer_token_t *token) {
    return lexer_not_implemented(lex, token);
}
|
||||
|
||||
// True for intra-line whitespace (space or tab); newlines are separate
// tokens and deliberately excluded.
bool is_whitespace_character(char c) {
    switch (c) {
    case ' ':
    case '\t':
        return true;
    default:
        return false;
    }
}
|
||||
|
||||
// Lex a run of spaces/tabs into a single TOKEN_WHITESPACE token.
error_t *lexer_next_whitespace(lexer_t *lex, lexer_token_t *token) {
    constexpr size_t max_whitespace_length = 1024;
    char scratch[max_whitespace_length + 1] = {};
    size_t consumed = 0;

    token->id = TOKEN_WHITESPACE;
    token->line_number = lex->line_number;
    token->character_number = lex->character_number;

    error_t *err = lexer_consume(lex, max_whitespace_length, scratch,
                                 is_whitespace_character, &consumed);
    if (err == err_consume_excessive_length) {
        token->id = TOKEN_ERROR;
        token->explanation =
            "Whitespace length exceeds the maximum of 1024 characters";
    }

    lex->character_number += consumed;
    token->value = strdup(scratch);
    return nullptr;
}
|
||||
|
||||
// Comments run to the end of the line: any character except CR/LF.
bool is_comment_character(char c) {
    return !(c == '\r' || c == '\n');
}
|
||||
|
||||
// Lex a ';' comment: everything up to (but not including) the newline.
error_t *lexer_next_comment(lexer_t *lex, lexer_token_t *token) {
    constexpr size_t max_comment_length = 1024;
    char scratch[max_comment_length + 1] = {};
    size_t consumed = 0;

    token->id = TOKEN_COMMENT;
    token->line_number = lex->line_number;
    token->character_number = lex->character_number;

    error_t *err = lexer_consume(lex, max_comment_length, scratch,
                                 is_comment_character, &consumed);
    if (err == err_consume_excessive_length) {
        token->id = TOKEN_ERROR;
        token->explanation =
            "Comment length exceeds the maximum of 1024 characters";
    }

    lex->character_number += consumed;
    token->value = strdup(scratch);
    return nullptr;
}
|
||||
|
||||
// Produce the next token. Multi-character tokens are delegated to their
// dedicated lexers based on the first buffered character; everything else
// is a single-character token (or a TOKEN_ERROR for unexpected input).
error_t *lexer_next(lexer_t *lex, lexer_token_t *token) {
    memset(token, 0, sizeof(lexer_token_t));
    error_t *err = lexer_fill_buffer(lex);
    if (err)
        return err;

    const char first = lex->buffer[0];
    if (first == '_' || isalpha(first))
        return lexer_next_identifier(lex, token);
    if (isdigit(first))
        return lexer_next_number(lex, token);
    if (first == '\'')
        return lexer_next_character(lex, token);
    if (first == '"')
        return lexer_next_string(lex, token);
    if (first == ' ' || first == '\t')
        return lexer_next_whitespace(lex, token);
    if (first == ';')
        return lexer_next_comment(lex, token);
    if (first == '\r' || first == '\n')
        return lexer_next_newline(lex, token);

    // Single-character tokens. Unmapped entries are zero, and TOKEN_ERROR
    // is deliberately the zero-valued enumerator.
    static const lexer_token_id_t single_char[256] = {
        [':'] = TOKEN_COLON,    [','] = TOKEN_COMMA,
        ['['] = TOKEN_LBRACKET, [']'] = TOKEN_RBRACKET,
        ['+'] = TOKEN_PLUS,     ['-'] = TOKEN_MINUS,
        ['*'] = TOKEN_ASTERISK, ['.'] = TOKEN_DOT,
    };
    token->id = single_char[(unsigned char)first];
    token->value = strdup((char[]){first, 0});
    lexer_shift_buffer(lex, 1);
    token->line_number = lex->line_number;
    token->character_number = lex->character_number;
    if (token->id == TOKEN_ERROR) {
        token->explanation =
            "unexpected character during lexing (first of token)";
    }
    lex->character_number += 1;
    return nullptr;
}
|
56
src/lexer.h
Normal file
56
src/lexer.h
Normal file
@ -0,0 +1,56 @@
|
||||
#ifndef INCLUDE_SRC_LEXER_H_
#define INCLUDE_SRC_LEXER_H_

#include "error.h"
#include <stddef.h>
#include <stdio.h>

// Sentinel compared by pointer identity: normal end of input, not a failure.
extern error_t *err_eof;

// Every kind of token the lexer can emit. TOKEN_ERROR is first (value 0)
// so zero-initialized tokens default to the error state.
typedef enum {
    TOKEN_ERROR,
    TOKEN_IDENTIFIER,
    TOKEN_DECIMAL,
    TOKEN_HEXADECIMAL,
    TOKEN_OCTAL,
    TOKEN_BINARY,
    TOKEN_CHAR,
    TOKEN_STRING,
    TOKEN_COLON,
    TOKEN_COMMA,
    TOKEN_LBRACKET,
    TOKEN_RBRACKET,
    TOKEN_PLUS,
    TOKEN_MINUS,
    TOKEN_ASTERISK,
    TOKEN_DOT,
    TOKEN_COMMENT,
    TOKEN_NEWLINE,
    TOKEN_WHITESPACE,
} lexer_token_id_t;

// One lexed token. `value` is heap-allocated and owned by the token
// (release via lexer_token_cleanup); `explanation` is a static string set
// only when id == TOKEN_ERROR. Positions are 0-based.
typedef struct lexer_token {
    lexer_token_id_t id;
    size_t line_number;
    size_t character_number;
    char *value;
    const char *explanation;
} lexer_token_t;

// Size of the internal read-ahead window; also bounds prefix lookahead.
constexpr size_t lexer_buffer_size = 32;

// Streaming lexer state over an open file.
typedef struct lexer {
    size_t line_number;      // 0-based current line
    size_t character_number; // 0-based current column
    size_t buffer_count;     // valid bytes at the front of buffer
    char buffer[lexer_buffer_size];
    FILE *fp;                // nullptr when the lexer is closed
} lexer_t;

void lexer_close(lexer_t *lex);
error_t *lexer_open(lexer_t *lex, char *path);
// Produces the next token; returns err_eof once input is exhausted.
error_t *lexer_next(lexer_t *lex, lexer_token_t *token);
void lexer_token_print(lexer_token_t *token);
void lexer_token_cleanup(lexer_token_t *token);

#endif // INCLUDE_SRC_LEXER_H_
|
62
src/main.c
Normal file
62
src/main.c
Normal file
@ -0,0 +1,62 @@
|
||||
#include "error.h"
|
||||
#include "lexer.h"
|
||||
|
||||
#include <limits.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
// -tokens mode sink: dump the token's metadata; never stops the loop.
bool print_token(lexer_token_t *token) {
    lexer_token_print(token);
    return true;
}
|
||||
|
||||
// -text mode sink: echo the token's raw text. On an error token, print a
// caret under the offending column plus the explanation, and return false
// to stop the lexing loop.
bool print_value(lexer_token_t *token) {
    if (token->id != TOKEN_ERROR) {
        printf("%s", token->value);
        return true;
    }
    printf("%s\n", token->value);
    for (size_t i = 0; i < token->character_number; ++i)
        printf(" ");
    printf("^-- %s\n", token->explanation);
    return false;
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
if (argc != 3 ||
|
||||
(strcmp(argv[1], "-tokens") != 0 && strcmp(argv[1], "-text") != 0)) {
|
||||
puts("Usage: oas -tokens <filename>");
|
||||
puts("Usage: oas -text <filename>");
|
||||
return 1;
|
||||
}
|
||||
|
||||
bool (*print_fn)(lexer_token_t *);
|
||||
char *filename = argv[2];
|
||||
if (strcmp(argv[1], "-tokens") == 0) {
|
||||
print_fn = print_token;
|
||||
} else {
|
||||
print_fn = print_value;
|
||||
}
|
||||
|
||||
lexer_t lex = {0};
|
||||
lexer_token_t token;
|
||||
error_t *err = lexer_open(&lex, filename);
|
||||
if (err) {
|
||||
puts(err->message);
|
||||
error_free(err);
|
||||
return 1;
|
||||
}
|
||||
|
||||
bool keep_going = true;
|
||||
while (keep_going && (err = lexer_next(&lex, &token)) == nullptr) {
|
||||
keep_going = print_fn(&token);
|
||||
free(token.value);
|
||||
}
|
||||
|
||||
if (err && err != err_eof) {
|
||||
puts(err->message);
|
||||
}
|
||||
error_free(err);
|
||||
return 0;
|
||||
}
|
9
tests/input/valid.asm
Normal file
9
tests/input/valid.asm
Normal file
@ -0,0 +1,9 @@
|
||||
_start:
|
||||
mov eax, 555 ; move 555 into eax
|
||||
push 0o777
|
||||
xor eax, 0xDEADBEEF
|
||||
and ecx, 0o770
|
||||
mov edx, 0b01010101
|
||||
push 0xffff:64
|
||||
push 0o777:16
|
||||
push 0b0001:16
|
18
validate.sh
Executable file
18
validate.sh
Executable file
@ -0,0 +1,18 @@
|
||||
#!/bin/bash

set -euo pipefail

# Static analysis first: --status-bugs makes scan-build exit non-zero (and
# abort this script) if any bug is reported.
scan-build -o reports/ -plist-html --status-bugs make clean all

# Build the sanitizer-instrumented binaries plus the plain one.
make clean sanitize all

# Run every test input through both sanitizer builds and valgrind, in both
# output modes. All expansions are quoted so filenames containing spaces
# survive word splitting (the original expanded ${ARGUMENTS[@]}, $ARGS and
# $INPUT_FILE unquoted).
ARGUMENTS=("-tokens" "-text")
while IFS= read -r INPUT_FILE; do
    for ARGS in "${ARGUMENTS[@]}"; do
        ./oas-asan "$ARGS" "$INPUT_FILE" > /dev/null
        ./oas-msan "$ARGS" "$INPUT_FILE" > /dev/null
        valgrind --leak-check=full --error-exitcode=1 ./oas "$ARGS" "$INPUT_FILE" >/dev/null
    done
done < <(find tests/input/ -type f -name '*.asm')
|
Loading…
x
Reference in New Issue
Block a user