Compare commits


20 Commits

Author SHA1 Message Date
3db9fd9b8f Added object file format spec
This is a WIP.
2025-04-03 00:49:31 +02:00
0d3881f680 Update the test input file to contain all AST nodes
2025-04-02 21:41:27 +02:00
5ea942024f add functionality to main to parse and print the ast 2025-04-02 20:57:02 +02:00
b4757e008c Add parse_result_wrap to wrap a result with another parent node
Use the new wrap function to wrap numbers and immediate nodes
2025-04-02 20:57:02 +02:00
b70b6896bf Partial parser implementation 2025-04-02 20:56:59 +02:00
6ca7bb3661 Fix incorrect size comparison in lexer_consume_n
The buffer length len and the requested number of tokens n are mixed up
in an invalid comparison. This causes all valid requests for n < len
tokens to be denied and all invalid requests for n > len tokens to be
accepted. This may cause a buffer overflow if the caller requests more
characters than they provide space for.
2025-04-02 20:41:49 +02:00
d424c0f886 Add a parser combinator to parse a delimited list 2025-04-02 20:41:49 +02:00
c66489dd90 Add basic parser combinators 2025-04-02 20:41:49 +02:00
44fa66c2b7 Add "primitive" parsers for all the non-trivia tokens in the lexer grammar 2025-04-02 20:41:42 +02:00
c48adb1306 Add basic parser utilities 2025-04-02 20:38:35 +02:00
5fb6ebef28 Add functions to skip over trivia in a tokenlist 2025-04-02 11:59:24 +02:00
bbdcad024f Add function to print the AST 2025-04-02 11:50:25 +02:00
935da30257 Add basic AST functionality 2025-04-02 11:35:53 +02:00
34ace36920 Add a parser grammar 2025-04-02 11:28:58 +02:00
bd37ddaeea Add tokenlist, a linked list of lexer tokens
The linked list is doubly linked so the parser can look forward into it
and error reporting can look backward.

This commit also reworks main to use the tokenlist instead of dealing
with the lexer manually.
2025-03-31 18:43:34 +02:00
42da7b1d05 Move err_allocation_failed into error.c and make it available to everyone.
2025-03-31 18:43:34 +02:00
75fc72c35d Add action to run validation on every commit
Adds some flags to the makefile to make it build on alpine with a
different libc
2025-03-31 14:36:15 +02:00
5cdb60d395 Remove peek function 2025-03-30 22:51:47 +02:00
e5830daac9 Add documentation comments to the lexer code 2025-03-30 22:51:15 +02:00
4becfb868e Reduce excessive line length in Makefile 2025-03-30 22:07:35 +02:00
23 changed files with 1358 additions and 66 deletions


@@ -0,0 +1,36 @@
name: Validate the build
run-name: ${{ gitea.actor }} is validating code with clang 19
on: [push]
jobs:
  validate-build:
    runs-on: ubuntu-latest
    container:
      image: node:18-alpine
    steps:
      - name: Install dependencies
        run: |
          apk add --no-cache git make bash
          # Install LLVM/Clang 19 from edge repository
          echo "http://dl-cdn.alpinelinux.org/alpine/edge/main" >> /etc/apk/repositories
          echo "http://dl-cdn.alpinelinux.org/alpine/edge/community" >> /etc/apk/repositories
          apk update
          apk add --no-cache llvm19 clang19 clang19-analyzer compiler-rt valgrind
          # Verify versions
          echo "---------------------"
          echo "Clang version:"
          clang --version
          echo "---------------------"
          echo "Valgrind version:"
          valgrind --version
          echo "---------------------"
      - name: Check out repository code
        uses: actions/checkout@v4
      - name: make validate
        run: |
          make validate


@@ -2,7 +2,7 @@
CC=clang
LD=clang
CFLAGS=-Wall -Wextra -Wpedantic -O0 -g3 -std=c23 -fno-omit-frame-pointer -fno-optimize-sibling-calls
CFLAGS=-Wall -Wextra -Wpedantic -O0 -g3 -std=c23 -fno-omit-frame-pointer -fno-optimize-sibling-calls -D_POSIX_C_SOURCE=200809L
LDFLAGS?=
SOURCES = $(shell find src/ -type f -name '*.c')
@@ -10,7 +10,7 @@ OBJECTS = $(SOURCES:.c=.o)
DEPENDENCIES = $(SOURCES:.c=.d)
TARGET?=oas
OUTPUTS=oas oas-asan oas-msan oas-afl
RUNARGUMENTS?=-tokens tests/input/valid.asm
RUNARGUMENTS?=ast tests/input/valid.asm
all: $(TARGET)
@@ -25,8 +25,12 @@ fuzz:
afl-fuzz -i tests/input -o reports/afl -m none -- ./oas-afl -tokens @@
sanitize:
make CFLAGS="$(CFLAGS) -fsanitize=address,undefined" LDFLAGS="-fsanitize=address,undefined" TARGET="oas-asan" clean-objects all
make CFLAGS="$(CFLAGS) -fsanitize=memory -fsanitize-memory-track-origins=2" LDFLAGS="-fsanitize=memory -fsanitize-memory-track-origins=2" TARGET="oas-msan" clean-objects all
make CFLAGS="$(CFLAGS) -fsanitize=address,undefined" \
LDFLAGS="-fsanitize=address,undefined" \
TARGET="oas-asan" clean-objects all
make CFLAGS="$(CFLAGS) -fsanitize=memory -fsanitize-memory-track-origins=2" \
LDFLAGS="-fsanitize=memory -fsanitize-memory-track-origins=2" \
TARGET="oas-msan" clean-objects all
make clean-objects
validate:

doc/object_format.md Normal file

@@ -0,0 +1,55 @@
# Linker file format
```C
struct object_file {
uint64_t magic; // ".oo-bin"
uint64_t version; // 1
uint64_t architecture; // AMD64(0)
uint64_t offsets_offset;
struct offsets {
uint64_t strings;
uint64_t sections;
uint64_t symbols;
uint64_t relocations;
} offsets;
struct string_table {
uint64_t size;
uint8_t data[static size];
} strings;
struct section_table {
uint32_t count;
struct section_entry {
uint32_t name;
uint64_t offset;
uint64_t size_on_disk;
uint64_t size_in_memory;
uint64_t flags;
} sections[static count];
} sections;
struct symbol_table {
uint32_t count;
struct symbol_entry {
uint32_t name;
uint8_t kind; // IMPORT(0) | EXPORT(1) | LOCAL(2)
uint32_t section;
uint64_t offset;
} symbols[static count];
} symbols;
struct relocation_table {
uint32_t count;
struct relocation_entry {
uint32_t section;
uint64_t offset;
uint8_t size;
uint32_t symbol;
uint8_t kind; // ABSOLUTE(0) | RELATIVE(1)
} relocations[static count];
} relocations;
};
```
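For illustration, a minimal sketch of a header reader for this layout, assuming a little-endian host, tightly packed fields, and a NUL-padded magic (the struct and function names are hypothetical, not part of the commit):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

// Fixed-size prefix of the file, in the field order given by the spec above
struct object_header {
    uint64_t magic;          // the 8 bytes ".oo-bin\0"
    uint64_t version;        // 1
    uint64_t architecture;   // AMD64(0)
    uint64_t offsets_offset; // file offset of the offsets struct
};

// Reads the header from fp and checks magic, version and architecture
bool object_header_read(FILE *fp, struct object_header *header) {
    if (fread(header, sizeof(*header), 1, fp) != 1)
        return false;
    return memcmp(&header->magic, ".oo-bin", 8) == 0 &&
           header->version == 1 && header->architecture == 0;
}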

doc/parser_grammar.txt Normal file

@@ -0,0 +1,39 @@
<program> ::= <statement>*
<statement> ::= <label> | <directive> | <instruction>
<label> ::= <identifier> <colon>
<directive> ::= <dot> <section_directive>
<section_directive> ::= "section" <identifier>
<instruction> ::= <identifier> <operands>
<operands> ::= <operand> ( <comma> <operand> )*
<operand> ::= <register> | <immediate> | <memory>
<immediate> ::= <number> | <label_reference>
<number> ::= <octal> | <binary> | <decimal> | <hexadecimal>
<label_reference> ::= <identifier>
<memory> ::= <lbracket> <memory_expression> <rbracket>
<memory_expression> ::= <label_reference> | <register_expression>
<register_expression> ::= <register> <register_index>? <register_offset>?
<register_index> ::= <plus> <register> <asterisk> <number>
<register_offset> ::= <plus_or_minus> <number>
<plus_or_minus> ::= <plus> | <minus>
/* These are lexer identifiers with the correct string value */
<section> ::= "section"
<register> ::= "rax" | "rbx" | "rcx" | "rdx" | "rsi" | "rdi" | "rbp" | "rsp" |
"r8" | "r9" | "r10" | "r11" | "r12" | "r13" | "r14" | "r15"

src/ast.c Normal file

@@ -0,0 +1,187 @@
#include "ast.h"
#include "error.h"
#include <assert.h>
#include <string.h>
error_t *err_node_children_cap = &(error_t){
.message = "Failed to increase ast node children, max capacity reached"};
error_t *ast_node_alloc(ast_node_t **output) {
*output = nullptr;
ast_node_t *node = calloc(1, sizeof(ast_node_t));
if (node == nullptr)
return err_allocation_failed;
*output = node;
return nullptr;
}
void ast_node_free_value(ast_node_t *node) {
// TODO: decide how value ownership will work and clean it up here
}
void ast_node_free(ast_node_t *node) {
if (node == nullptr)
return;
if (node->children) {
for (size_t i = 0; i < node->len; ++i)
ast_node_free(node->children[i]);
free(node->children);
}
ast_node_free_value(node);
memset(node, 0, sizeof(ast_node_t));
free(node);
}
/**
* @pre node->children must be nullptr
*/
error_t *ast_node_alloc_children(ast_node_t *node) {
node->children = calloc(node_default_children_cap, sizeof(ast_node_t *));
if (node->children == nullptr)
return err_allocation_failed;
node->cap = node_default_children_cap;
return nullptr;
}
error_t *ast_node_grow_cap(ast_node_t *node) {
if (node->cap >= node_max_children_cap) {
return err_node_children_cap;
}
size_t new_cap = node->cap * 2;
if (new_cap > node_max_children_cap) {
new_cap = node_max_children_cap;
}
ast_node_t **new_children =
realloc(node->children, new_cap * sizeof(ast_node_t *));
if (new_children == nullptr) {
return err_allocation_failed;
}
node->children = new_children;
node->cap = new_cap;
return nullptr;
}
error_t *ast_node_add_child(ast_node_t *node, ast_node_t *child) {
error_t *err = nullptr;
if (node->children == nullptr)
err = ast_node_alloc_children(node);
else if (node->len >= node->cap)
err = ast_node_grow_cap(node);
if (err)
return err;
node->children[node->len] = child;
node->len += 1;
return nullptr;
}
const char *ast_node_id_to_cstr(node_id_t id) {
switch (id) {
case NODE_INVALID:
return "NODE_INVALID";
case NODE_PROGRAM:
return "NODE_PROGRAM";
case NODE_STATEMENT:
return "NODE_STATEMENT";
case NODE_LABEL:
return "NODE_LABEL";
case NODE_DIRECTIVE:
return "NODE_DIRECTIVE";
case NODE_INSTRUCTION:
return "NODE_INSTRUCTION";
case NODE_OPERANDS:
return "NODE_OPERANDS";
case NODE_OPERAND:
return "NODE_OPERAND";
case NODE_IMMEDIATE:
return "NODE_IMMEDIATE";
case NODE_MEMORY:
return "NODE_MEMORY";
case NODE_NUMBER:
return "NODE_NUMBER";
case NODE_LABEL_REFERENCE:
return "NODE_LABEL_REFERENCE";
case NODE_MEMORY_EXPRESSION:
return "NODE_MEMORY_EXPRESSION";
case NODE_REGISTER_EXPRESSION:
return "NODE_REGISTER_EXPRESSION";
case NODE_REGISTER_INDEX:
return "NODE_REGISTER_INDEX";
case NODE_REGISTER_OFFSET:
return "NODE_REGISTER_OFFSET";
case NODE_PLUS_OR_MINUS:
return "NODE_PLUS_OR_MINUS";
case NODE_SECTION_DIRECTIVE:
return "NODE_SECTION_DIRECTIVE";
case NODE_REGISTER:
return "NODE_REGISTER";
case NODE_SECTION:
return "NODE_SECTION";
case NODE_IDENTIFIER:
return "NODE_IDENTIFIER";
case NODE_DECIMAL:
return "NODE_DECIMAL";
case NODE_HEXADECIMAL:
return "NODE_HEXADECIMAL";
case NODE_OCTAL:
return "NODE_OCTAL";
case NODE_BINARY:
return "NODE_BINARY";
case NODE_CHAR:
return "NODE_CHAR";
case NODE_STRING:
return "NODE_STRING";
case NODE_COLON:
return "NODE_COLON";
case NODE_COMMA:
return "NODE_COMMA";
case NODE_LBRACKET:
return "NODE_LBRACKET";
case NODE_RBRACKET:
return "NODE_RBRACKET";
case NODE_PLUS:
return "NODE_PLUS";
case NODE_MINUS:
return "NODE_MINUS";
case NODE_ASTERISK:
return "NODE_ASTERISK";
case NODE_DOT:
return "NODE_DOT";
}
assert(!"Unreachable, weird node id" && id);
__builtin_unreachable();
}
static void ast_node_print_internal(ast_node_t *node, int indent) {
if (node == NULL) {
return;
}
for (int i = 0; i < indent; i++) {
printf(" ");
}
printf("%s", ast_node_id_to_cstr(node->id));
if (node->token_entry && node->token_entry->token.value) {
printf(" \"%s\"", node->token_entry->token.value);
}
printf("\n");
for (size_t i = 0; i < node->len; i++) {
ast_node_print_internal(node->children[i], indent + 1);
}
}
void ast_node_print(ast_node_t *node) {
ast_node_print_internal(node, 0);
}

src/ast.h Normal file

@@ -0,0 +1,120 @@
#ifndef INCLUDE_SRC_AST_H_
#define INCLUDE_SRC_AST_H_
#include "error.h"
#include "lexer.h"
#include "tokenlist.h"
#include <stddef.h>
#include <stdint.h>
typedef enum node_id {
NODE_INVALID,
NODE_PROGRAM,
NODE_STATEMENT,
NODE_LABEL,
NODE_DIRECTIVE,
NODE_INSTRUCTION,
NODE_OPERANDS,
NODE_OPERAND,
NODE_IMMEDIATE,
NODE_MEMORY,
NODE_NUMBER,
NODE_LABEL_REFERENCE,
NODE_MEMORY_EXPRESSION,
NODE_REGISTER_EXPRESSION,
NODE_REGISTER_INDEX,
NODE_REGISTER_OFFSET,
NODE_PLUS_OR_MINUS,
NODE_SECTION_DIRECTIVE,
// Validated primitives
NODE_REGISTER,
NODE_SECTION,
// Primitive nodes
NODE_IDENTIFIER,
NODE_DECIMAL,
NODE_HEXADECIMAL,
NODE_OCTAL,
NODE_BINARY,
NODE_CHAR,
NODE_STRING,
NODE_COLON,
NODE_COMMA,
NODE_LBRACKET,
NODE_RBRACKET,
NODE_PLUS,
NODE_MINUS,
NODE_ASTERISK,
NODE_DOT,
} node_id_t;
typedef struct ast_node ast_node_t;
constexpr size_t node_default_children_cap = 8;
/* 65K ought to be enough for anybody */
constexpr size_t node_max_children_cap = 1 << 16;
struct ast_node {
node_id_t id;
tokenlist_entry_t *token_entry;
size_t len;
size_t cap;
ast_node_t **children;
union {
struct {
uint64_t value;
int size;
} integer;
char *name;
} value;
};
/**
* @brief Allocates a new AST node
*
* Creates and initializes a new AST node with default (zero) values.
*
* @param[out] output Pointer to store the allocated node
* @return error_t* nullptr on success, allocation error on failure
*/
error_t *ast_node_alloc(ast_node_t **node);
/**
* @brief Frees an AST node and all its children recursively
*
* Recursively frees all children of the node, then frees the node itself.
* If node is nullptr, the function returns without doing anything.
*
* @param node The node to free
*/
void ast_node_free(ast_node_t *node);
/**
* @brief Adds a child node to a parent node
*
* Adds the specified child node to the parent's children array.
* If this is the first child, the function allocates the children array.
* If the children array is full, the function increases its capacity.
*
* @param node The parent node to add the child to
* @param child The child node to add
* @return error_t* nullptr on success, allocation error on failure,
* or err_node_children_cap if maximum capacity is reached
*/
error_t *ast_node_add_child(ast_node_t *node, ast_node_t *child);
/**
* @brief Prints an AST starting from the given node
*
* Prints a representation of the AST with indentation to show structure.
* Each node's type is shown, and if a node has an associated token value,
* that value is printed in quotes.
*
* @param node The root node of the AST to print
*/
void ast_node_print(ast_node_t *node);
#endif // INCLUDE_SRC_AST_H_
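A minimal usage sketch for this API (hypothetical caller; the node ids come from the enum above and error handling is abbreviated):

#include "ast.h"

error_t *ast_demo(void) {
    ast_node_t *program = nullptr;
    ast_node_t *label = nullptr;
    error_t *err = ast_node_alloc(&program);
    if (err)
        return err;
    program->id = NODE_PROGRAM;
    err = ast_node_alloc(&label);
    if (err) {
        ast_node_free(program);
        return err;
    }
    label->id = NODE_LABEL;
    // On success the program node owns label and frees it recursively
    err = ast_node_add_child(program, label);
    if (err) {
        ast_node_free(label);
        ast_node_free(program);
        return err;
    }
    ast_node_print(program); // NODE_PROGRAM with NODE_LABEL indented below it
    ast_node_free(program);
    return nullptr;
}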


@@ -10,6 +10,9 @@ error_t *const err_errorf_length = &(error_t){
.message =
"Formatting of another error failed to determine the error length"};
error_t *err_allocation_failed =
&(error_t){.message = "Memory allocation failed"};
error_t *errorf(const char *fmt, ...) {
error_t *err = calloc(1, sizeof(error_t));
if (err == nullptr)


@@ -18,4 +18,7 @@ static inline void error_free(error_t *err) {
free(err);
}
/* Some global errors */
extern error_t *err_allocation_failed;
#endif // INCLUDE_SRC_ERROR_H_


@@ -20,9 +20,6 @@ error_t *err_eof =
error_t *err_unknown_read = &(error_t){.message = "Unknown read error"};
error_t *err_allocation_failed =
&(error_t){.message = "Memory allocation failed"};
typedef bool (*char_predicate_t)(char);
const char *lexer_token_id_to_cstr(lexer_token_id_t id) {
@@ -89,6 +86,15 @@ void lexer_close(lexer_t *lex) {
memset(lex, 0, sizeof(lexer_t));
}
/**
* Attempts to fill the lexer's internal buffer with more data from the file.
* Only reads data if the buffer isn't already full and the file hasn't reached
* EOF.
*
* @param lex The lexer to fill the buffer for
* @return nullptr on success, an error otherwise (including err_eof if EOF
* reached with empty buffer)
*/
error_t *lexer_fill_buffer(lexer_t *lex) {
if (feof(lex->fp) && lex->buffer_count == 0)
return err_eof;
@@ -126,25 +132,28 @@ error_t *lexer_open(lexer_t *lex, char *path) {
return nullptr;
}
/**
* Shifts the lexer's buffer by n characters, discarding the first n characters
* and moving the remaining characters to the beginning of the buffer.
*
* @param lex The lexer whose buffer to shift
* @param n Number of characters to shift out
*
* @pre There must be at least n characters in the input buffer
*/
void lexer_shift_buffer(lexer_t *lex, int n) {
assert(lex->buffer_count >= n);
lex->buffer_count -= n;
memmove(lex->buffer, lex->buffer + n, lex->buffer_count);
}
error_t *lexer_peek(lexer_t *lex, char *c) {
error_t *err = lexer_fill_buffer(lex);
if (err)
return err;
if (lex->buffer_count == 0)
return err_eof;
*c = lex->buffer[0];
lexer_shift_buffer(lex, 1);
return nullptr;
}
// This does _not_ fill the internal lexer buffer and you _must_ call
// lexer_fill_buffer() before calling this. It will always return false if your
// prefix is larger than lexer_buffer_size
/**
* Checks if the lexer's buffer starts with the given prefix.
*
* @param lex The lexer to check
* @param prefix The string prefix to check for
* @return true if the buffer starts with the prefix, false otherwise
*/
bool lexer_has_prefix(lexer_t *lex, char *prefix) {
size_t len = strlen(prefix);
if (len > lex->buffer_count)
@@ -159,17 +168,42 @@ error_t *lexer_not_implemented(lexer_t *lex, lexer_token_t *token) {
lex->character_number);
}
/**
* Consumes exactly n characters from the buffer into the provided output
* buffer.
*
* @param lex The lexer to consume from
* @param len Size of the output buffer
* @param buffer Output buffer to store the consumed characters
* @param n Number of characters to consume
* @return nullptr on success, an error otherwise (err_buffer_underrun if buffer
* contains fewer than n characters)
*/
error_t *lexer_consume_n(lexer_t *lex, const size_t len,
char buffer[static len], const size_t n) {
if (lex->buffer_count < n)
return err_buffer_underrun;
if (len > n)
if (n > len)
return err_consume_excessive_length;
memcpy(buffer, lex->buffer, n);
lexer_shift_buffer(lex, n);
return nullptr;
}
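With the corrected comparison, a caller that requests more characters than its output buffer holds is refused instead of overflowing. A hedged illustration (assumes an initialized lexer with enough buffered input):

char word[4];
// Accepted: n (3) <= len (4), provided the lexer has 3 characters buffered
error_t *err = lexer_consume_n(lex, sizeof(word), word, 3);
// Always refused with err_consume_excessive_length: n (8) > len (4)
err = lexer_consume_n(lex, sizeof(word), word, 8);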
/**
* Consumes characters from the lexer buffer that satisfy the predicate
* function. Will attempt to refill the buffer if more valid characters are
* available.
*
* @param lex The lexer to consume from
* @param n Maximum number of characters to consume
* @param buffer Output buffer to store consumed characters
* @param is_valid Function that determines if a character should be consumed
* @param n_consumed Output parameter that will contain the number of characters
* consumed
* @return nullptr on success, an error otherwise
*/
error_t *lexer_consume(lexer_t *lex, const size_t n, char buffer[static n],
char_predicate_t is_valid, size_t *n_consumed) {
const size_t buffer_size = n;
@@ -217,6 +251,18 @@ bool is_decimal_character(char c) {
return isdigit(c);
}
/**
* Processes a number token (decimal, hexadecimal, octal, or binary).
* Handles number formats with optional size suffixes.
*
* @param lex The lexer to read from
* @param token Output parameter that will be populated with the token
* information
* @return nullptr on success, an error otherwise
*
* @pre There must be at least one character in the input buffer and it should
* be [0-9]
*/
error_t *lexer_next_number(lexer_t *lex, lexer_token_t *token) {
constexpr size_t max_number_length = 128;
size_t so_far = 0;
@@ -294,6 +340,19 @@ error_t *lexer_next_number(lexer_t *lex, lexer_token_t *token) {
token->value = strdup(buffer);
return nullptr;
}
/**
* Processes a newline token (\n or \r\n).
* Updates the lexer's line and character position tracking.
*
* @param lex The lexer to read from
* @param token Output parameter that will be populated with the token
* information
* @return nullptr on success, an error otherwise
*
 * @pre There must be at least one character in the input buffer and it must
* be [\r\n]
*/
error_t *lexer_next_newline(lexer_t *lex, lexer_token_t *token) {
token->line_number = lex->line_number;
token->character_number = lex->character_number;
@@ -323,6 +382,19 @@ bool is_identifier_character(char c) {
return isalnum(c) || c == '_';
}
/**
* Processes an identifier token.
* Identifiers start with a letter or underscore and can contain alphanumeric
* characters or underscores.
*
* @param lex The lexer to read from
* @param token Output parameter that will be populated with the token
* information
* @return nullptr on success, an error otherwise
*
* @pre There must be at least 1 character in the read buffer and it must be
* [a-zA-Z_]
*/
error_t *lexer_next_identifier(lexer_t *lex, lexer_token_t *token) {
constexpr size_t max_identifier_length = 128;
size_t n = 0;
@@ -355,6 +427,17 @@ bool is_whitespace_character(char c) {
return c == ' ' || c == '\t';
}
/**
* Processes a whitespace token (spaces and tabs).
*
* @param lex The lexer to read from
* @param token Output parameter that will be populated with the token
* information
* @return nullptr on success, an error otherwise
*
* @pre There must be at least one character in the buffer and it must be
* [ \t]
*/
error_t *lexer_next_whitespace(lexer_t *lex, lexer_token_t *token) {
constexpr size_t max_whitespace_length = 1024;
size_t n = 0;
@@ -380,6 +463,16 @@ bool is_comment_character(char c) {
return c != '\r' && c != '\n';
}
/**
* Processes a comment token (starts with ';' and continues to end of line).
*
* @param lex The lexer to read from
* @param token Output parameter that will be populated with the token
* information
* @return nullptr on success, an error otherwise
*
* @pre There must be at least one character in the buffer and it must be ';'
*/
error_t *lexer_next_comment(lexer_t *lex, lexer_token_t *token) {
constexpr size_t max_comment_length = 1024;
size_t n = 0;


@@ -47,10 +47,43 @@ typedef struct lexer {
FILE *fp;
} lexer_t;
/**
* @brief Closes a lexer and releases associated resources
*
* @param lex Pointer to the lexer to close
*/
void lexer_close(lexer_t *lex);
/**
* @brief Opens a file for lexical analysis
*
* @param lex Pointer to the lexer to initialize
* @param path Path to the file to open
* @return error_t* nullptr on success, or error describing the failure
*/
error_t *lexer_open(lexer_t *lex, char *path);
/**
* @brief Reads the next token from the input stream
*
* @param lex Pointer to an initialized lexer
* @param token Pointer to a token structure to fill with the next token
* @return error_t* nullptr on success, err_eof at end of file, or other error
*/
error_t *lexer_next(lexer_t *lex, lexer_token_t *token);
/**
* @brief Prints a token to stdout for debugging purposes
*
* @param token Pointer to the token to print
*/
void lexer_token_print(lexer_token_t *token);
/**
* @brief Frees any resources associated with a token
*
* @param token Pointer to the token to clean up
*/
void lexer_token_cleanup(lexer_token_t *token);
#endif // INCLUDE_SRC_LEXER_H_
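Taken together, a typical scan loop over this interface looks roughly like the following sketch (mirrors the pre-tokenlist main; error reporting abbreviated):

lexer_t lex = {0};
lexer_token_t token;
error_t *err = lexer_open(&lex, "tests/input/valid.asm");
if (err)
    return err;
while ((err = lexer_next(&lex, &token)) == nullptr) {
    lexer_token_print(&token);
    lexer_token_cleanup(&token);
}
if (err != err_eof)
    puts(err->message); // real errors only; err_eof is the normal end
lexer_close(&lex);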


@@ -1,62 +1,108 @@
#include "error.h"
#include "lexer.h"
#include "parser/parser.h"
#include "tokenlist.h"
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
bool print_token(lexer_token_t *token) {
lexer_token_print(token);
return true;
typedef enum mode { MODE_AST, MODE_TEXT, MODE_TOKENS } mode_t;
void print_tokens(tokenlist_t *list) {
for (auto entry = list->head; entry; entry = entry->next) {
auto token = &entry->token;
lexer_token_print(token);
}
}
bool print_value(lexer_token_t *token) {
if (token->id == TOKEN_ERROR) {
printf("%s\n", token->value);
for (size_t i = 0; i < token->character_number; ++i)
printf(" ");
printf("^-- %s\n", token->explanation);
} else {
printf("%s", token->value);
void print_text(tokenlist_t *list) {
for (auto entry = list->head; entry; entry = entry->next) {
auto token = &entry->token;
if (token->id == TOKEN_ERROR) {
printf("%s\n", token->value);
for (size_t i = 0; i < token->character_number; ++i)
printf(" ");
printf("^-- %s\n", token->explanation);
return;
} else {
printf("%s", token->value);
}
}
return token->id != TOKEN_ERROR;
}
void print_ast(tokenlist_t *list) {
parse_result_t result = parse(list->head);
if (result.err) {
puts(result.err->message);
error_free(result.err);
return;
}
ast_node_print(result.node);
if (result.next != nullptr) {
puts("First unparsed token:");
lexer_token_print(&result.next->token);
}
ast_node_free(result.node);
}
int get_execution_mode(int argc, char *argv[]) {
if (argc != 3 || (strcmp(argv[1], "tokens") != 0 &&
strcmp(argv[1], "text") != 0 && strcmp(argv[1], "ast") != 0)) {
puts("Usage: oas [tokens|text|ast] <filename>");
exit(1);
}
if (strcmp(argv[1], "tokens") == 0)
return MODE_TOKENS;
if (strcmp(argv[1], "text") == 0)
return MODE_TEXT;
return MODE_AST;
}
int main(int argc, char *argv[]) {
if (argc != 3 ||
(strcmp(argv[1], "-tokens") != 0 && strcmp(argv[1], "-text") != 0)) {
puts("Usage: oas -tokens <filename>");
puts("Usage: oas -text <filename>");
return 1;
}
bool (*print_fn)(lexer_token_t *);
mode_t mode = get_execution_mode(argc, argv);
char *filename = argv[2];
if (strcmp(argv[1], "-tokens") == 0) {
print_fn = print_token;
} else {
print_fn = print_value;
lexer_t *lex = &(lexer_t){};
error_t *err = lexer_open(lex, filename);
if (err)
goto cleanup_error;
tokenlist_t *list;
err = tokenlist_alloc(&list);
if (err)
goto cleanup_lexer;
err = tokenlist_fill(list, lex);
if (err)
goto cleanup_tokens;
switch (mode) {
case MODE_TOKENS:
print_tokens(list);
break;
case MODE_TEXT:
print_text(list);
break;
case MODE_AST:
print_ast(list);
break;
}
lexer_t lex = {0};
lexer_token_t token;
error_t *err = lexer_open(&lex, filename);
if (err) {
puts(err->message);
error_free(err);
return 1;
}
bool keep_going = true;
while (keep_going && (err = lexer_next(&lex, &token)) == nullptr) {
keep_going = print_fn(&token);
free(token.value);
}
if (err && err != err_eof) {
puts(err->message);
}
tokenlist_free(list);
error_free(err);
return 0;
cleanup_tokens:
tokenlist_free(list);
cleanup_lexer:
lexer_close(lex);
cleanup_error:
puts(err->message);
error_free(err);
return 1;
}

src/parser/combinators.c Normal file

@@ -0,0 +1,126 @@
#include "combinators.h"
// Parse a list of elements accepted by the given parser, delimited by the
// given token id. The delimiters are not stored in the parent node.
parse_result_t parse_list(tokenlist_entry_t *current, node_id_t id,
bool allow_none, lexer_token_id_t delimiter_id,
parser_t parser) {
ast_node_t *many;
error_t *err = ast_node_alloc(&many);
parse_result_t result;
if (err)
return parse_error(err);
many->id = id;
while (current) {
// Skip beyond the delimiter on all but the first iteration
if (many->len > 0) {
if (current->token.id != delimiter_id)
break;
current = tokenlist_next(current);
if (current == nullptr) {
// FIXME: this isn't quite right, we can't consume the delimiter
// if the next element will fail to parse but it's late and I
// must think this through tomorrow
break;
}
}
result = parser(current);
if (result.err == err_parse_no_match)
break;
if (result.err) {
ast_node_free(many);
return result;
}
err = ast_node_add_child(many, result.node);
if (err) {
ast_node_free(many);
ast_node_free(result.node);
return parse_error(err);
}
current = result.next;
}
if (!allow_none && many->len == 0) {
ast_node_free(many);
return parse_no_match();
}
return parse_success(many, current);
}
parse_result_t parse_any(tokenlist_entry_t *current, parser_t parsers[]) {
parser_t parser;
while ((parser = *parsers++)) {
parse_result_t result = parser(current);
if (result.err == nullptr)
return result;
}
return parse_no_match();
}
// Parse as many of the given parser's objects in a row as possible,
// potentially allowing none. Wraps the found objects in a new AST node with
// the given node id.
parse_result_t parse_many(tokenlist_entry_t *current, node_id_t id,
bool allow_none, parser_t parser) {
ast_node_t *many;
error_t *err = ast_node_alloc(&many);
parse_result_t result;
if (err)
return parse_error(err);
many->id = id;
while (current) {
result = parser(current);
if (result.err == err_parse_no_match)
break;
if (result.err) {
ast_node_free(many);
return result;
}
err = ast_node_add_child(many, result.node);
if (err) {
ast_node_free(many);
ast_node_free(result.node);
return parse_error(err);
}
current = result.next;
}
if (!allow_none && many->len == 0) {
ast_node_free(many);
return parse_no_match();
}
return parse_success(many, current);
}
// Parse all tries to parse all parsers consecutively and if it succeeds it
// wraps the parsed nodes in a new parent node.
parse_result_t parse_consecutive(tokenlist_entry_t *current, node_id_t id,
parser_t parsers[]) {
ast_node_t *all;
error_t *err = ast_node_alloc(&all);
parse_result_t result;
if (err)
return parse_error(err);
all->id = id;
parser_t parser;
while ((parser = *parsers++) && current) {
result = parser(current);
if (result.err) {
ast_node_free(all);
return result;
}
err = ast_node_add_child(all, result.node);
if (err) {
ast_node_free(result.node);
ast_node_free(all);
return parse_error(err);
}
current = result.next;
}
return parse_success(all, current);
}
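In use, the combinators turn grammar rules into small wrapper functions that fix the remaining arguments; e.g. a hypothetical rule for a comma-separated register list (reusing NODE_OPERANDS for lack of a dedicated node id):

parse_result_t parse_register_list(tokenlist_entry_t *current) {
    // One or more registers; the comma delimiters are not kept in the node
    return parse_list(current, NODE_OPERANDS, false, TOKEN_COMMA,
                      parse_register);
}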

src/parser/combinators.h Normal file

@@ -0,0 +1,25 @@
#ifndef INCLUDE_PARSER_COMBINATORS_H_
#define INCLUDE_PARSER_COMBINATORS_H_
#include "util.h"
typedef parse_result_t (*parser_t)(tokenlist_entry_t *);
parse_result_t parse_any(tokenlist_entry_t *current, parser_t parsers[]);
// Parse as many of the given parser's objects in a row as possible,
// potentially allowing none. Wraps the found objects in a new AST node with
// the given node id.
parse_result_t parse_many(tokenlist_entry_t *current, node_id_t id,
bool allow_none, parser_t parser);
parse_result_t parse_list(tokenlist_entry_t *current, node_id_t id,
bool allow_none, lexer_token_id_t delimiter_id,
parser_t parser);
// Parse all tries to parse all parsers consecutively and if it succeeds it
// wraps the parsed nodes in a new parent node.
parse_result_t parse_consecutive(tokenlist_entry_t *current, node_id_t id,
parser_t parsers[]);
#endif // INCLUDE_PARSER_COMBINATORS_H_

src/parser/parser.c Normal file

@@ -0,0 +1,140 @@
#include "parser.h"
#include "../ast.h"
#include "../lexer.h"
#include "../tokenlist.h"
#include "combinators.h"
#include "primitives.h"
#include "util.h"
parse_result_t parse_number(tokenlist_entry_t *current) {
parser_t parsers[] = {parse_octal, parse_decimal, parse_hexadecimal,
parse_binary, nullptr};
parse_result_t result = parse_any(current, parsers);
return parse_result_wrap(NODE_NUMBER, result);
}
parse_result_t parse_plus_or_minus(tokenlist_entry_t *current) {
parser_t parsers[] = {parse_plus, parse_minus, nullptr};
return parse_any(current, parsers);
}
parse_result_t parse_register_index(tokenlist_entry_t *current) {
parser_t parsers[] = {parse_plus, parse_register, parse_asterisk,
parse_number, nullptr};
return parse_consecutive(current, NODE_REGISTER_INDEX, parsers);
}
parse_result_t parse_register_offset(tokenlist_entry_t *current) {
parser_t parsers[] = {parse_plus_or_minus, parse_number, nullptr};
return parse_consecutive(current, NODE_REGISTER_OFFSET, parsers);
}
parse_result_t parse_register_expression(tokenlist_entry_t *current) {
parse_result_t result;
ast_node_t *expr;
error_t *err = ast_node_alloc(&expr);
if (err)
return parse_error(err);
expr->id = NODE_REGISTER_EXPRESSION;
// <register>
result = parse_register(current);
if (result.err) {
ast_node_free(expr);
return result;
}
err = ast_node_add_child(expr, result.node);
if (err) {
ast_node_free(result.node);
ast_node_free(expr);
return parse_error(err);
}
current = result.next;
// <register_index>?
result = parse_register_index(current);
if (result.err) {
error_free(result.err);
} else {
err = ast_node_add_child(expr, result.node);
if (err) {
ast_node_free(result.node);
ast_node_free(expr);
return parse_error(err);
}
current = result.next;
}
// <register_offset>?
result = parse_register_offset(current);
if (result.err) {
error_free(result.err);
} else {
err = ast_node_add_child(expr, result.node);
if (err) {
ast_node_free(result.node);
ast_node_free(expr);
return parse_error(err);
}
current = result.next;
}
return parse_success(expr, current);
}
parse_result_t parse_immediate(tokenlist_entry_t *current) {
parser_t parsers[] = {parse_number, parse_identifier, nullptr};
parse_result_t result = parse_any(current, parsers);
return parse_result_wrap(NODE_IMMEDIATE, result);
}
parse_result_t parse_memory_expression(tokenlist_entry_t *current) {
parser_t parsers[] = {parse_register_expression, parse_identifier, nullptr};
return parse_any(current, parsers);
}
parse_result_t parse_memory(tokenlist_entry_t *current) {
parser_t parsers[] = {parse_lbracket, parse_memory_expression,
parse_rbracket, nullptr};
return parse_consecutive(current, NODE_MEMORY, parsers);
}
parse_result_t parse_operand(tokenlist_entry_t *current) {
parser_t parsers[] = {parse_register, parse_memory, parse_immediate,
nullptr};
return parse_any(current, parsers);
}
parse_result_t parse_operands(tokenlist_entry_t *current) {
return parse_list(current, NODE_OPERANDS, true, TOKEN_COMMA, parse_operand);
}
parse_result_t parse_label(tokenlist_entry_t *current) {
parser_t parsers[] = {parse_identifier, parse_colon, nullptr};
return parse_consecutive(current, NODE_LABEL, parsers);
}
parse_result_t parse_section_directive(tokenlist_entry_t *current) {
parser_t parsers[] = {parse_section, parse_identifier, nullptr};
return parse_consecutive(current, NODE_SECTION_DIRECTIVE, parsers);
}
parse_result_t parse_directive(tokenlist_entry_t *current) {
parser_t parsers[] = {parse_dot, parse_section_directive, nullptr};
return parse_consecutive(current, NODE_DIRECTIVE, parsers);
}
parse_result_t parse_instruction(tokenlist_entry_t *current) {
parser_t parsers[] = {parse_identifier, parse_operands, nullptr};
return parse_consecutive(current, NODE_INSTRUCTION, parsers);
}
parse_result_t parse_statement(tokenlist_entry_t *current) {
parser_t parsers[] = {parse_label, parse_directive, parse_instruction,
nullptr};
return parse_any(current, parsers);
}
parse_result_t parse(tokenlist_entry_t *current) {
return parse_many(current, NODE_PROGRAM, true, parse_statement);
}

src/parser/parser.h Normal file

@@ -0,0 +1,9 @@
#ifndef INCLUDE_PARSER_PARSER_H_
#define INCLUDE_PARSER_PARSER_H_
#include "../tokenlist.h"
#include "util.h"
parse_result_t parse(tokenlist_entry_t *current);
#endif // INCLUDE_PARSER_PARSER_H_

src/parser/primitives.c Normal file

@@ -0,0 +1,103 @@
#include "primitives.h"
#include "../ast.h"
#include <string.h>
parse_result_t parse_identifier(tokenlist_entry_t *current) {
return parse_token(current, TOKEN_IDENTIFIER, NODE_IDENTIFIER, nullptr);
}
parse_result_t parse_decimal(tokenlist_entry_t *current) {
return parse_token(current, TOKEN_DECIMAL, NODE_DECIMAL, nullptr);
}
parse_result_t parse_hexadecimal(tokenlist_entry_t *current) {
return parse_token(current, TOKEN_HEXADECIMAL, NODE_HEXADECIMAL, nullptr);
}
parse_result_t parse_binary(tokenlist_entry_t *current) {
return parse_token(current, TOKEN_BINARY, NODE_BINARY, nullptr);
}
parse_result_t parse_octal(tokenlist_entry_t *current) {
return parse_token(current, TOKEN_OCTAL, NODE_OCTAL, nullptr);
}
parse_result_t parse_string(tokenlist_entry_t *current) {
return parse_token(current, TOKEN_STRING, NODE_STRING, nullptr);
}
parse_result_t parse_char(tokenlist_entry_t *current) {
return parse_token(current, TOKEN_CHAR, NODE_CHAR, nullptr);
}
parse_result_t parse_colon(tokenlist_entry_t *current) {
return parse_token(current, TOKEN_COLON, NODE_COLON, nullptr);
}
parse_result_t parse_comma(tokenlist_entry_t *current) {
return parse_token(current, TOKEN_COMMA, NODE_COMMA, nullptr);
}
parse_result_t parse_lbracket(tokenlist_entry_t *current) {
return parse_token(current, TOKEN_LBRACKET, NODE_LBRACKET, nullptr);
}
parse_result_t parse_rbracket(tokenlist_entry_t *current) {
return parse_token(current, TOKEN_RBRACKET, NODE_RBRACKET, nullptr);
}
parse_result_t parse_plus(tokenlist_entry_t *current) {
return parse_token(current, TOKEN_PLUS, NODE_PLUS, nullptr);
}
parse_result_t parse_minus(tokenlist_entry_t *current) {
return parse_token(current, TOKEN_MINUS, NODE_MINUS, nullptr);
}
parse_result_t parse_asterisk(tokenlist_entry_t *current) {
return parse_token(current, TOKEN_ASTERISK, NODE_ASTERISK, nullptr);
}
parse_result_t parse_dot(tokenlist_entry_t *current) {
return parse_token(current, TOKEN_DOT, NODE_DOT, nullptr);
}
parse_result_t parse_label_reference(tokenlist_entry_t *current) {
return parse_token(current, TOKEN_IDENTIFIER, NODE_LABEL_REFERENCE,
nullptr);
}
const char *registers[] = {
// 64-bit registers
"rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi", "r8", "r9", "r10",
"r11", "r12", "r13", "r14", "r15",
// 32-bit registers
"eax", "ecx", "edx", "ebx", "esp", "ebp", "esi", "edi", "r8d", "r9d",
"r10d", "r11d", "r12d", "r13d", "r14d", "r15d",
// 16-bit registers
"ax", "cx", "dx", "bx", "sp", "bp", "si", "di", "r8w", "r9w", "r10w",
"r11w", "r12w", "r13w", "r14w", "r15w",
// 8-bit low registers
"al", "cl", "dl", "bl", "spl", "bpl", "sil", "dil", "r8b", "r9b", "r10b",
"r11b", "r12b", "r13b", "r14b", "r15b", nullptr};
bool is_register_token(lexer_token_t *token) {
for (size_t i = 0; registers[i] != nullptr; ++i)
if (strcmp(token->value, registers[i]) == 0)
return true;
return false;
}
parse_result_t parse_register(tokenlist_entry_t *current) {
return parse_token(current, TOKEN_IDENTIFIER, NODE_REGISTER,
is_register_token);
}
bool is_section_token(lexer_token_t *token) {
return strcmp(token->value, "section") == 0;
}
parse_result_t parse_section(tokenlist_entry_t *current) {
return parse_token(current, TOKEN_IDENTIFIER, NODE_SECTION,
is_section_token);
}

src/parser/primitives.h Normal file

@@ -0,0 +1,30 @@
#ifndef INCLUDE_PARSER_PRIMITIVES_H_
#define INCLUDE_PARSER_PRIMITIVES_H_
#include "util.h"
parse_result_t parse_identifier(tokenlist_entry_t *current);
parse_result_t parse_decimal(tokenlist_entry_t *current);
parse_result_t parse_hexadecimal(tokenlist_entry_t *current);
parse_result_t parse_binary(tokenlist_entry_t *current);
parse_result_t parse_octal(tokenlist_entry_t *current);
parse_result_t parse_string(tokenlist_entry_t *current);
parse_result_t parse_char(tokenlist_entry_t *current);
parse_result_t parse_colon(tokenlist_entry_t *current);
parse_result_t parse_comma(tokenlist_entry_t *current);
parse_result_t parse_lbracket(tokenlist_entry_t *current);
parse_result_t parse_rbracket(tokenlist_entry_t *current);
parse_result_t parse_plus(tokenlist_entry_t *current);
parse_result_t parse_minus(tokenlist_entry_t *current);
parse_result_t parse_asterisk(tokenlist_entry_t *current);
parse_result_t parse_dot(tokenlist_entry_t *current);
parse_result_t parse_label_reference(tokenlist_entry_t *current);
/* These are "primitives" with a different name and some extra validation on
 * top. For example, a register is just an identifier, but it only matches a
 * limited set of values.
 */
parse_result_t parse_register(tokenlist_entry_t *current);
parse_result_t parse_section(tokenlist_entry_t *current);
#endif // INCLUDE_PARSER_PRIMITIVES_H_

src/parser/util.c Normal file

@@ -0,0 +1,56 @@
#include "util.h"
#include "../tokenlist.h"
error_t *err_parse_no_match =
&(error_t){.message = "parsing failed to find the correct token sequence"};
parse_result_t parse_error(error_t *err) {
return (parse_result_t){.err = err};
}
parse_result_t parse_no_match() {
return parse_error(err_parse_no_match);
}
parse_result_t parse_success(ast_node_t *ast, tokenlist_entry_t *next) {
next = tokenlist_skip_trivia(next);
return (parse_result_t){.node = ast, .next = next};
}
parse_result_t parse_token(tokenlist_entry_t *current,
lexer_token_id_t token_id, node_id_t ast_id,
token_validator_t is_valid) {
if (current->token.id != token_id ||
(is_valid && !is_valid(&current->token)))
return parse_no_match();
ast_node_t *node;
error_t *err = ast_node_alloc(&node);
if (err)
return parse_error(err);
node->id = ast_id;
node->token_entry = current;
return parse_success(node, current->next);
}
parse_result_t parse_result_wrap(node_id_t id, parse_result_t result) {
if (result.err)
return result;
ast_node_t *node;
error_t *err = ast_node_alloc(&node);
if (err) {
ast_node_free(result.node);
return parse_error(err);
}
node->id = id;
err = ast_node_add_child(node, result.node);
if (err) {
ast_node_free(result.node);
return parse_error(err);
}
return parse_success(node, result.next);
}

src/parser/util.h Normal file

@@ -0,0 +1,26 @@
#ifndef INCLUDE_PARSER_UTIL_H_
#define INCLUDE_PARSER_UTIL_H_
#include "../ast.h"
#include "../error.h"
#include "../tokenlist.h"
typedef struct parse_result {
error_t *err;
tokenlist_entry_t *next;
ast_node_t *node;
} parse_result_t;
typedef bool (*token_validator_t)(lexer_token_t *);
parse_result_t parse_error(error_t *err);
parse_result_t parse_no_match();
parse_result_t parse_success(ast_node_t *ast, tokenlist_entry_t *next);
parse_result_t parse_token(tokenlist_entry_t *current,
lexer_token_id_t token_id, node_id_t ast_id,
token_validator_t is_valid);
parse_result_t parse_result_wrap(node_id_t id, parse_result_t result);
extern error_t *err_parse_no_match;
#endif // INCLUDE_PARSER_UTIL_H_

src/tokenlist.c Normal file

@@ -0,0 +1,106 @@
#include "tokenlist.h"
#include "error.h"
#include "lexer.h"
#include <stdlib.h>
error_t *tokenlist_alloc(tokenlist_t **output) {
*output = nullptr;
tokenlist_t *list = calloc(1, sizeof(tokenlist_t));
if (list == nullptr)
return err_allocation_failed;
list->head = nullptr;
list->tail = nullptr;
*output = list;
return nullptr;
}
error_t *tokenlist_entry_alloc(tokenlist_entry_t **output) {
*output = nullptr;
tokenlist_entry_t *entry = calloc(1, sizeof(tokenlist_entry_t));
if (entry == nullptr)
return err_allocation_failed;
entry->next = nullptr;
entry->prev = nullptr;
*output = entry;
return nullptr;
}
void tokenlist_append(tokenlist_t *list, tokenlist_entry_t *entry) {
if (list->head == nullptr) {
list->head = entry;
list->tail = entry;
entry->next = nullptr;
entry->prev = nullptr;
} else {
entry->prev = list->tail;
entry->next = nullptr;
list->tail->next = entry;
list->tail = entry;
}
}
void tokenlist_entry_free(tokenlist_entry_t *entry) {
lexer_token_cleanup(&entry->token);
free(entry);
}
void tokenlist_free(tokenlist_t *list) {
if (list == nullptr)
return;
tokenlist_entry_t *current = list->head;
while (current) {
tokenlist_entry_t *next = current->next;
tokenlist_entry_free(current);
current = next;
}
free(list);
}
error_t *tokenlist_fill(tokenlist_t *list, lexer_t *lex) {
error_t *err = nullptr;
lexer_token_t token = {};
while ((err = lexer_next(lex, &token)) == nullptr) {
tokenlist_entry_t *entry;
err = tokenlist_entry_alloc(&entry);
if (err) {
lexer_token_cleanup(&token);
return err;
}
entry->token = token;
tokenlist_append(list, entry);
}
if (err != err_eof)
return err;
return nullptr;
}
bool is_trivia(tokenlist_entry_t *trivia) {
switch (trivia->token.id) {
case TOKEN_WHITESPACE:
case TOKEN_COMMENT:
case TOKEN_NEWLINE:
return true;
default:
return false;
}
}
tokenlist_entry_t *tokenlist_skip_trivia(tokenlist_entry_t *current) {
while (current && is_trivia(current))
current = current->next;
return current;
}
tokenlist_entry_t *tokenlist_next(tokenlist_entry_t *current) {
if (!current)
return nullptr;
return tokenlist_skip_trivia(current->next);
}

src/tokenlist.h Normal file

@@ -0,0 +1,40 @@
#ifndef INCLUDE_SRC_TOKENLIST_H_
#define INCLUDE_SRC_TOKENLIST_H_
#include "lexer.h"
typedef struct tokenlist_entry tokenlist_entry_t;
struct tokenlist_entry {
lexer_token_t token;
tokenlist_entry_t *next;
tokenlist_entry_t *prev;
};
typedef struct tokenlist {
tokenlist_entry_t *head;
tokenlist_entry_t *tail;
} tokenlist_t;
/**
* @brief Allocate a new doubly linked list of lexer tokens
*/
error_t *tokenlist_alloc(tokenlist_t **list);
/**
* Consume all tokens from the lexer and add them to the list
*/
error_t *tokenlist_fill(tokenlist_t *list, lexer_t *lex);
void tokenlist_free(tokenlist_t *list);
/**
* Return the first token entry that isn't whitespace, newline or comment
*/
tokenlist_entry_t *tokenlist_skip_trivia(tokenlist_entry_t *current);
/**
* Return the next token entry that isn't whitespace, newline or comment
*/
tokenlist_entry_t *tokenlist_next(tokenlist_entry_t *current);
#endif // INCLUDE_SRC_TOKENLIST_H_
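A short end-to-end sketch of the intended flow (hypothetical caller; error handling and cleanup abbreviated):

lexer_t lex = {0};
tokenlist_t *list = nullptr;
error_t *err = lexer_open(&lex, "tests/input/valid.asm");
if (!err)
    err = tokenlist_alloc(&list);
if (!err)
    err = tokenlist_fill(list, &lex);
if (!err)
    // Iterate over the significant tokens, skipping whitespace and comments
    for (auto e = tokenlist_skip_trivia(list->head); e; e = tokenlist_next(e))
        lexer_token_print(&e->token);
tokenlist_free(list);
lexer_close(&lex);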


@@ -1,5 +1,17 @@
.section text
; Small valid code snippet that should contain all different AST nodes
_start:
mov eax, 555 ; move 555 into eax
mov eax, ebx
lea eax, [eax + ebx * 4 + 8]
lea eax, [eax + 8]
lea eax, [eax + ebx * 8]
lea eax, [esp - 24]
lea eax, [eax + ebx * 4 - 8]
lea eax, [_start]
mov eax, _start
mov eax, 555
push 0o777
xor eax, 0xDEADBEEF
and ecx, 0o770


@@ -10,7 +10,7 @@ scan-build -o reports/static-analysis/ -plist-html --status-bugs make all
# Run the sanitizer builds and valgrind
make clean sanitize all
ARGUMENTS=("-tokens" "-text")
ARGUMENTS=("tokens" "text" "ast")
while IFS= read -r INPUT_FILE; do
for ARGS in ${ARGUMENTS[@]}; do
./oas-asan $ARGS $INPUT_FILE > /dev/null