Implement printing the encoding in main

Incomplete second pass encoding
Add bytes type and tests
2025-04-16 23:10:17 +02:00 · 2025-04-16 23:10:09 +02:00 · 2025-04-16 23:10:09 +02:00 · 2025-04-16 23:10:09 +02:00 · 2025-04-16 23:10:00 +02:00 · 2025-04-16 23:10:00 +02:00
18 changed files with 816 additions and 21 deletions
--- a/doc/parser_grammar.txt
+++ b/doc/parser_grammar.txt
@ -1,9 +1,9 @@
 <program>   ::= <statement>*
-<statement> ::= <label> | <directive> | <instruction>
+<statement> ::= <label> | <directive> | <instruction> | <newline>

 <label> ::= <identifier> <colon>

-<directive> ::= <dot> (<section_directive> | <export_directive> | <import_directive> )
+<directive> ::= <dot> (<section_directive> | <export_directive> | <import_directive> ) <newline>

 <section_directive> ::= "section" <identifier>

@ -11,7 +11,7 @@

 <import_directive> ::= "import" <identifier>

-<instruction> ::= <identifier> <operands>
+<instruction> ::= <identifier> <operands> <newline>

 <operands> ::= <operand> ( <comma> <operand> )*

--- a/src/ast.c
+++ b/src/ast.c
@ -161,6 +161,8 @@ const char *ast_node_id_to_cstr(node_id_t id) {
        return "NODE_ASTERISK";
    case NODE_DOT:
        return "NODE_DOT";
+    case NODE_NEWLINE:
+        return "NODE_NEWLINE";
    case NODE_IMPORT:
        return "NODE_IMPORT";
    case NODE_EXPORT:
@ -180,7 +182,8 @@ static void ast_node_print_internal(ast_node_t *node, int indent) {
    }
    printf("%s", ast_node_id_to_cstr(node->id));

-    if (node->token_entry && node->token_entry->token.value) {
+    if (node->token_entry && node->token_entry->token.value &&
+        node->id != NODE_NEWLINE) {
        printf(" \"%s\"", node->token_entry->token.value);
    }
    printf("\n");
@ -193,3 +196,18 @@ static void ast_node_print_internal(ast_node_t *node, int indent) {
 void ast_node_print(ast_node_t *node) {
    ast_node_print_internal(node, 0);
 }
+
+/**
+ * Recursively drop every descendant of `node` whose id equals `id`.
+ *
+ * Matching children are handed to ast_node_free and skipped; surviving
+ * children are pruned in turn and compacted towards the front of the
+ * children array. The id of `node` itself is never inspected.
+ */
+void ast_node_prune(ast_node_t *node, node_id_t id) {
+    size_t kept = 0;
+    for (size_t idx = 0; idx < node->len; ++idx) {
+        ast_node_t *current = node->children[idx];
+        if (current->id != id) {
+            ast_node_prune(current, id);
+            node->children[kept++] = current;
+        } else {
+            ast_node_free(current);
+        }
+    }
+    node->len = kept;
+}
--- a/src/ast.h
+++ b/src/ast.h
@ -55,6 +55,7 @@ typedef enum node_id {
    NODE_MINUS,
    NODE_ASTERISK,
    NODE_DOT,
+    NODE_NEWLINE,
 } node_id_t;

 typedef struct ast_node ast_node_t;
@ -73,6 +74,11 @@ typedef struct register_ {
    operand_size_t size;
 } register_t;

+/**
+ * Encoded bytes stored in the value of an instruction node.
+ */
+typedef struct opcode_encoding {
+    uint8_t encoding[32]; // raw bytes; only the first `len` are meaningful
+    size_t len;           // number of bytes used in `encoding`
+} opcode_encoding_t;
+
 struct ast_node {
    node_id_t id;
    tokenlist_entry_t *token_entry;
@ -83,6 +89,7 @@ struct ast_node {
    union {
        register_t reg;
        number_t number;
+        opcode_encoding_t encoding;
    } value;
 };

@ -131,4 +138,17 @@ error_t *ast_node_add_child(ast_node_t *node, ast_node_t *child);
 */
 void ast_node_print(ast_node_t *node);

+/**
+ * Prune the children with a given id
+ *
+ * The tree is recursively visited and all child nodes of a given ID are pruned
+ * completely. If a node has the given id, it will get removed along with all
+ * its children, even if some of those children have different ids. The root
+ * node id is never checked so the tree is guaranteed to remain allocated and
+ * valid.
+ *
+ * @param node The root of the tree you want to prune
+ * @param id The id of the nodes you want to prune
+ */
+void ast_node_prune(ast_node_t *node, node_id_t id);
+
 #endif // INCLUDE_SRC_AST_H_
--- a/src/bytes.c
+++ b/src/bytes.c
@ -0,0 +1,6 @@
+#include "bytes.h"
+#include "error.h"
+
+// Sentinel error returned by the bytes_append_* helpers when the value does
+// not fit in the destination buffer.
+error_t *const err_bytes_no_capacity = &(error_t){
+    .message = "Not enough capacity in bytes buffer",
+};
--- a/src/bytes.h
+++ b/src/bytes.h
@ -0,0 +1,60 @@
+#ifndef INCLUDE_SRC_BYTES_H_
+#define INCLUDE_SRC_BYTES_H_
+
+#include "error.h"
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+extern error_t *const err_bytes_no_capacity;
+
+/**
+ * Fixed-capacity byte buffer with its storage placed inline after the header.
+ */
+typedef struct bytes {
+    size_t len;       // number of bytes currently stored in `buffer`
+    size_t cap;       // total number of bytes `buffer` can hold
+    uint8_t buffer[]; // flexible array member holding the data
+} bytes_t;
+
+/**
+ * Build an anonymous compound literal with the same leading fields as bytes_t
+ * (len, cap) followed by an N byte array, with len = 0, cap = N and a zeroed
+ * buffer, and yield a pointer to it.
+ */
+#define LOCAL_BYTES_ANONYMOUS(N)                                               \
+    &(struct {                                                                 \
+        size_t len;                                                            \
+        size_t cap;                                                            \
+        uint8_t buffer[(N)];                                                   \
+    }) {                                                                       \
+        0, (N), {}                                                             \
+    }
+
+/**
+ * Yield a bytes_t pointer to a local buffer with room for N bytes.
+ *
+ * The expansion is fully parenthesised and carries no trailing semicolon, so
+ * the macro can be used anywhere an expression is allowed; callers supply
+ * their own statement terminator.
+ */
+#define LOCAL_BYTES(N) ((bytes_t *)LOCAL_BYTES_ANONYMOUS(N))
+
+/**
+ * Append one byte to the buffer.
+ *
+ * @return nullptr on success, err_bytes_no_capacity when the buffer is full
+ */
+static inline error_t *bytes_append_uint8(bytes_t *bytes, uint8_t value) {
+    if (bytes->len < bytes->cap) {
+        bytes->buffer[bytes->len] = value;
+        bytes->len += 1;
+        return nullptr;
+    }
+    return err_bytes_no_capacity;
+}
+
+/**
+ * Append `n` bytes from `buffer` to `dst`.
+ *
+ * The append succeeds whenever the bytes fit, including when they fill the
+ * buffer exactly. On failure `dst` is left unchanged.
+ *
+ * @param dst destination buffer
+ * @param n number of bytes to copy
+ * @param buffer source of at least `n` bytes
+ * @return nullptr on success, err_bytes_no_capacity when there is not enough
+ *         room left
+ */
+static inline error_t *bytes_append_array(bytes_t *dst, size_t n,
+                                          uint8_t buffer[static n]) {
+    // Compare against the remaining space instead of computing len + n, which
+    // could wrap around for very large n.
+    if (dst->len > dst->cap || n > dst->cap - dst->len)
+        return err_bytes_no_capacity;
+    memcpy(dst->buffer + dst->len, buffer, n);
+    dst->len += n;
+    return nullptr;
+}
+
+/**
+ * Append the used portion of `src` (its first `src->len` bytes) to `dst`.
+ *
+ * @return whatever bytes_append_array returns for that copy
+ */
+static inline error_t *bytes_append_bytes(bytes_t *dst, bytes_t *src) {
+    const size_t count = src->len;
+    uint8_t *payload = src->buffer;
+    return bytes_append_array(dst, count, payload);
+}
+
+/**
+ * Append a 16 bit value in little-endian byte order, independent of the byte
+ * order of the host.
+ *
+ * @return the result of bytes_append_array for the two bytes
+ */
+static inline error_t *bytes_append_uint16(bytes_t *dst, uint16_t value) {
+    uint8_t le[sizeof(value)];
+    for (size_t i = 0; i < sizeof(value); ++i)
+        le[i] = (uint8_t)(value >> (8 * i));
+    return bytes_append_array(dst, sizeof(le), le);
+}
+
+/**
+ * Append a 32 bit value in little-endian byte order, independent of the byte
+ * order of the host.
+ *
+ * @return the result of bytes_append_array for the four bytes
+ */
+static inline error_t *bytes_append_uint32(bytes_t *dst, uint32_t value) {
+    uint8_t le[sizeof(value)];
+    for (size_t i = 0; i < sizeof(value); ++i)
+        le[i] = (uint8_t)(value >> (8 * i));
+    return bytes_append_array(dst, sizeof(le), le);
+}
+
+/**
+ * Append a 64 bit value in little-endian byte order, independent of the byte
+ * order of the host.
+ *
+ * @return the result of bytes_append_array for the eight bytes
+ */
+static inline error_t *bytes_append_uint64(bytes_t *dst, uint64_t value) {
+    uint8_t le[sizeof(value)];
+    for (size_t i = 0; i < sizeof(value); ++i)
+        le[i] = (uint8_t)(value >> (8 * i));
+    return bytes_append_array(dst, sizeof(le), le);
+}
+
+#endif // INCLUDE_SRC_BYTES_H_
--- a/src/data/opcodes.c
+++ b/src/data/opcodes.c
@ -51,7 +51,7 @@ opcode_data_t *const opcodes[] = {
            { .kind = OPERAND_IMMEDIATE, .size = OPERAND_SIZE_32},
        },
    },
-    // Push reg16, 
+    // PUSH reg16, 
    &(opcode_data_t) {
        .mnemonic = "push",
        .opcode = 0x50,
@ -59,9 +59,86 @@ opcode_data_t *const opcodes[] = {
        .encoding_class = ENCODING_OPCODE_REGISTER,
        .operand_count = 1,
        .operands = {
-            { .kind = OPERAND_REGISTER, .size = OPERAND_SIZE_16 | OPERAND_SIZE_32 | OPERAND_SIZE_64 },
+            { .kind = OPERAND_REGISTER, .size = OPERAND_SIZE_16 },
        },
    },
+    // PUSH reg64
+    &(opcode_data_t) {
+        .mnemonic = "push",
+        .opcode = 0x50,
+        .opcode_extension = opcode_extension_none,
+        .encoding_class = ENCODING_OPCODE_REGISTER,
+        .operand_count = 1,
+        .operands = {
+            { .kind = OPERAND_REGISTER, .size = OPERAND_SIZE_64 },
+        },
+    },
+    // NOT reg16
+    &(opcode_data_t) {
+        .mnemonic = "not",
+        .opcode = 0xF7,
+        .opcode_extension = 2,
+        .operand_size_prefix = true,
+        .operand_count = 1,
+        .operands = {
+            { .kind = OPERAND_REGISTER, .size = OPERAND_SIZE_16 },
+        },
+    },
+    // NOT reg32
+    &(opcode_data_t) {
+        .mnemonic = "not",
+        .opcode = 0xF7,
+        .opcode_extension = 2,
+        .operand_count = 1,
+        .operands = {
+            { .kind = OPERAND_REGISTER, .size = OPERAND_SIZE_32 },
+        },
+    },
+    // NOT reg64
+    &(opcode_data_t) {
+        .mnemonic = "not",
+        .opcode = 0xF7,
+        .opcode_extension = 2,
+        .rex_w_prefix = true,
+        .operand_count = 1,
+        .operands = {
+            { .kind = OPERAND_REGISTER, .size = OPERAND_SIZE_64 },
+        },
+    },
+
+    // NEG reg16
+    &(opcode_data_t) {
+        .mnemonic = "neg",
+        .opcode = 0xF7,
+        .opcode_extension = 3,
+        .operand_size_prefix = true,
+        .operand_count = 1,
+        .operands = {
+            { .kind = OPERAND_REGISTER, .size = OPERAND_SIZE_16 },
+        },
+    },
+    // NEG reg32
+    &(opcode_data_t) {
+        .mnemonic = "neg",
+        .opcode = 0xF7,
+        .opcode_extension = 3,
+        .operand_count = 1,
+        .operands = {
+            { .kind = OPERAND_REGISTER, .size = OPERAND_SIZE_32 },
+        },
+    },
+    // NEG reg64
+    &(opcode_data_t) {
+        .mnemonic = "neg",
+        .opcode = 0xF7,
+        .opcode_extension = 3,
+        .rex_w_prefix = true,
+        .operand_count = 1,
+        .operands = {
+            { .kind = OPERAND_REGISTER, .size = OPERAND_SIZE_64 },
+        },
+    },
+

    nullptr,
 };
--- a/src/encoder/encoder.c
+++ b/src/encoder/encoder.c
@ -1,4 +1,5 @@
 #include "encoder.h"
+#include "../bytes.h"
 #include "../data/opcodes.h"
 #include "symbols.h"
 #include <assert.h>
@ -15,6 +16,12 @@ error_t *const err_encoder_invalid_size_suffix =
    &(error_t){.message = "Invalid number size suffix"};
 error_t *const err_encoder_unknown_symbol_reference =
    &(error_t){.message = "Referenced an unknown symbol"};
+// Returned when no entry of the opcode table matches an instruction.
+error_t *const err_encoder_no_encoding_found =
+    &(error_t){.message = "No encoding found for instruction"};
+// Returned by encoding paths that are still stubs.
+error_t *const err_encoder_not_implemented =
+    &(error_t){.message = "Implementation for this opcode is missing"};
+// Returned when the opcode and operand bytes exceed the accepted length.
+error_t *const err_encoder_unexpected_length =
+    &(error_t){.message = "Unexpectedly long encoding"};

 error_t *encoder_alloc(encoder_t **output) {
    *output = nullptr;
@ -156,8 +163,54 @@ error_t *encoder_set_register_value(ast_node_t *node) {
        if (strcmp(value, registers[i]->name) == 0) {
            node->value.reg.id = registers[i]->id;
            node->value.reg.size = registers[i]->size;
+            return nullptr;
        }
    }
+    return err_encoder_invalid_register;
+}
+
+/**
+ * Store a three bit opcode extension in bits 5-3 (the reg field) of a modrm
+ * byte, leaving the other bits untouched.
+ */
+static inline uint8_t modrm_extension(uint8_t modrm, uint8_t extension) {
+    assert(extension != opcode_extension_none);
+    assert((extension & 0b111) == extension);
+    const uint8_t without_reg = (uint8_t)(modrm & ~modrm_reg_mask);
+    return without_reg | extension << 3;
+}
+
+/**
+ * Return `rex` with rex_prefix_r added when bit 3 of the register id is set,
+ * and `rex` unchanged otherwise.
+ */
+static inline uint8_t modrm_reg_rex(uint8_t rex, register_id_t id) {
+    return (id & 0b1000) ? (rex | rex_prefix_r) : rex;
+}
+
+/**
+ * Place the low three bits of the register id in the reg field of a modrm
+ * byte. Bit 3 of the id is not stored here; pair this with modrm_reg_rex.
+ */
+static inline uint8_t modrm_reg(uint8_t modrm, register_id_t id) {
+    const uint8_t field = (uint8_t)((id & 0b111) << 3);
+    return (modrm & ~modrm_reg_mask) | field;
+}
+
+/**
+ * Return `rex` with rex_prefix_b added when bit 3 of the register id is set,
+ * and `rex` unchanged otherwise.
+ */
+static inline uint8_t modrm_rm_rex(uint8_t rex, register_id_t id) {
+    return (id & 0b1000) ? (rex | rex_prefix_b) : rex;
+}
+
+/**
+ * Place the low three bits of the register id in the rm field of a modrm
+ * byte whose mod field selects register mode. Bit 3 of the id is not stored
+ * here; pair this with modrm_rm_rex.
+ */
+static inline uint8_t modrm_rm(uint8_t modrm, register_id_t id) {
+    assert((modrm & modrm_mod_mask) == modrm_mod_register);
+    const uint8_t low_bits = id & 0b111;
+    return (modrm & ~modrm_rm_mask) | low_bits;
 }

 /**
@ -185,8 +238,272 @@ error_t *encoder_first_pass(encoder_t *encoder, ast_node_t *node) {
    return nullptr;
 }

+/**
+ * Check whether a parsed operand satisfies one operand slot of an opcode.
+ *
+ * Registers must have exactly the size of the slot, memory operands match on
+ * node kind only, numeric immediates match when their size flags overlap the
+ * slot size, and label references only match 32 bit immediate slots.
+ */
+bool is_operand_match(operand_info_t *info, ast_node_t *operand) {
+    if (info->kind == OPERAND_REGISTER) {
+        if (operand->id != NODE_REGISTER)
+            return false;
+        return operand->value.reg.size == info->size;
+    }
+
+    if (info->kind == OPERAND_MEMORY)
+        return operand->id == NODE_MEMORY;
+
+    if (info->kind == OPERAND_IMMEDIATE) {
+        if (operand->id != NODE_IMMEDIATE)
+            return false;
+        ast_node_t *inner = operand->children[0];
+
+        if (inner->id == NODE_NUMBER)
+            return (inner->value.number.size & info->size) > 0;
+        // FIXME: first pass should give us information about the distance of
+        // the label reference so we can pick a size more appropriately instead
+        // of just defaulting to 32 bits
+        if (inner->id == NODE_LABEL_REFERENCE)
+            return info->size == OPERAND_SIZE_32;
+    }
+
+    assert(false && "unreachable");
+    __builtin_unreachable();
+}
+
+/**
+ * Check whether an opcode table entry fits a mnemonic and its operand list:
+ * same mnemonic, same operand count, and every operand matching its slot.
+ */
+bool is_opcode_match(opcode_data_t *opcode, const char *mnemonic,
+                     ast_node_t *operands) {
+    if (strcmp(opcode->mnemonic, mnemonic) != 0 ||
+        opcode->operand_count != operands->len)
+        return false;
+
+    size_t slot = 0;
+    while (slot < operands->len) {
+        ast_node_t *operand = operands->children[slot];
+        if (!is_operand_match(&opcode->operands[slot], operand))
+            return false;
+        ++slot;
+    }
+
+    return true;
+}
+
+/**
+ * Find the first entry of the null-terminated opcode table that matches the
+ * instruction's mnemonic and operands.
+ *
+ * @param opcode_out receives the matching entry; untouched when none matches
+ * @return nullptr on success, err_encoder_no_encoding_found otherwise
+ */
+error_t *encoder_get_opcode_data(ast_node_t *instruction, ast_node_t *operands,
+                                 opcode_data_t **opcode_out) {
+    ast_node_t *mnemonic_node = instruction->children[0];
+    const char *mnemonic = mnemonic_node->token_entry->token.value;
+
+    for (opcode_data_t *const *entry = opcodes; *entry; ++entry) {
+        if (!is_opcode_match(*entry, mnemonic, operands))
+            continue;
+        *opcode_out = *entry;
+        return nullptr;
+    }
+    return err_encoder_no_encoding_found;
+}
+
+/**
+ * Placeholder for two operand instructions: checks that the opcode byte is
+ * already in the buffer and reports err_encoder_not_implemented.
+ */
+error_t *encode_two_operand(encoder_t *encoder, opcode_data_t *opcode,
+                            ast_node_t *operands, bytes_t *encoding,
+                            uint8_t *rex) {
+    assert(encoding->len >= 1 && "must have 1+ opcode byte in buffer already");
+    (void)encoder, (void)opcode, (void)operands, (void)encoding, (void)rex;
+    return err_encoder_not_implemented;
+}
+
+/**
+ * Encode a single register operand inside the opcode byte itself.
+ *
+ * The low three bits of the register id are OR-ed into the last byte already
+ * present in `encoding` (the opcode). Bit 3 of the id is carried by REX.B,
+ * which is the REX bit that extends a register embedded in the opcode byte
+ * (REX.R only extends the ModR/M reg field).
+ *
+ * @param operands operand list whose first child is the register node
+ * @param encoding buffer that must already contain the opcode byte
+ * @param rex accumulated REX bits, updated in place
+ * @return always nullptr
+ */
+error_t *encode_one_register_in_opcode(encoder_t *encoder,
+                                       opcode_data_t *opcode,
+                                       ast_node_t *operands, bytes_t *encoding,
+                                       uint8_t *rex) {
+    (void)encoder;
+    (void)opcode;
+    assert(encoding->len >= 1 && "must have 1+ opcode byte in buffer already");
+
+    register_id_t id = operands->children[0]->value.reg.id;
+    encoding->buffer[encoding->len - 1] |= id & 0b111;
+    if ((id & 0b1000) > 0) {
+        *rex |= rex_prefix_b;
+    }
+    return nullptr;
+}
+
+/**
+ * Encode a single register operand in a ModR/M byte and append that byte.
+ *
+ * When the opcode has an extension, the extension occupies the reg field and
+ * the register goes in the rm field; otherwise the register goes in the reg
+ * field. The matching REX bit is merged into `*rex`.
+ *
+ * @return the result of appending the ModR/M byte to `encoding`
+ */
+error_t *encode_one_register(encoder_t *encoder, opcode_data_t *opcode,
+                             ast_node_t *operands, bytes_t *encoding,
+                             uint8_t *rex) {
+    (void)encoder;
+    assert(operands->len == 1);
+    assert(operands->children[0]->id == NODE_REGISTER);
+
+    const register_id_t reg = operands->children[0]->value.reg.id;
+    const bool has_extension =
+        opcode->opcode_extension != opcode_extension_none;
+    uint8_t modrm = modrm_mod_register;
+
+    if (!has_extension) {
+        // NOTE:
+        // this case likely does not exist in practice: opcodes taking a single
+        // register in ModR/M all appear to use an opcode extension
+        modrm = modrm_reg(modrm, reg);
+        *rex = modrm_reg_rex(*rex, reg);
+        return bytes_append_uint8(encoding, modrm);
+    }
+
+    modrm = modrm_extension(modrm, opcode->opcode_extension);
+    modrm = modrm_rm(modrm, reg);
+    *rex = modrm_rm_rex(*rex, reg);
+    return bytes_append_uint8(encoding, modrm);
+}
+
+/**
+ * Append the single immediate operand to the encoding.
+ *
+ * Numbers are truncated to the size declared by the opcode's first operand
+ * slot. Label references are emitted as the 32 bit placeholder 0xDEADBEEF.
+ *
+ * @return nullptr on success, the append error when the buffer is full, or
+ *         err_encoder_not_implemented for an operand size this function does
+ *         not handle (so release builds never report success without having
+ *         emitted the immediate)
+ */
+error_t *encode_one_immediate(encoder_t *encoder, opcode_data_t *opcode,
+                              ast_node_t *operands, bytes_t *encoding,
+                              uint8_t *rex) {
+    (void)encoder;
+    (void)rex;
+    assert(operands->len == 1);
+    assert(operands->children[0]->id == NODE_IMMEDIATE);
+    assert(operands->children[0]->len == 1);
+    ast_node_t *immediate = operands->children[0]->children[0];
+    assert(immediate->id == NODE_NUMBER ||
+           immediate->id == NODE_LABEL_REFERENCE);
+
+    if (immediate->id != NODE_NUMBER) {
+        // FIXME: this still assumes references are always 32 bit
+        uint32_t value = 0xDEADBEEF;
+        return bytes_append_uint32(encoding, value);
+    }
+
+    uint64_t value = immediate->value.number.value;
+    switch (opcode->operands[0].size) {
+    case OPERAND_SIZE_8:
+        return bytes_append_uint8(encoding, (uint8_t)value);
+    case OPERAND_SIZE_16:
+        return bytes_append_uint16(encoding, (uint16_t)value);
+    case OPERAND_SIZE_32:
+        return bytes_append_uint32(encoding, (uint32_t)value);
+    case OPERAND_SIZE_64:
+        return bytes_append_uint64(encoding, value);
+    default:
+        break;
+    }
+    assert(false && "intentionally unhandled");
+    return err_encoder_not_implemented;
+}
+
+/**
+ * Placeholder for a single memory operand: ignores its arguments and reports
+ * err_encoder_not_implemented.
+ */
+error_t *encode_one_memory(encoder_t *encoder, opcode_data_t *opcode,
+                           ast_node_t *operands, bytes_t *encoding,
+                           uint8_t *rex) {
+    (void)encoder, (void)opcode, (void)operands, (void)encoding, (void)rex;
+    return err_encoder_not_implemented;
+}
+
+/**
+ * Dispatch a one operand instruction to the encoder for its operand kind.
+ *
+ * Register operands are encoded either inside the opcode byte (encoding class
+ * ENCODING_OPCODE_REGISTER) or in a ModR/M byte.
+ *
+ * @return the result of the selected encoder, or err_encoder_not_implemented
+ *         when the operand kind is not one of the handled kinds
+ */
+error_t *encode_one_operand(encoder_t *encoder, opcode_data_t *opcode,
+                            ast_node_t *operands, bytes_t *encoding,
+                            uint8_t *rex) {
+    switch (opcode->operands[0].kind) {
+    case OPERAND_REGISTER:
+        if (opcode->encoding_class == ENCODING_OPCODE_REGISTER)
+            return encode_one_register_in_opcode(encoder, opcode, operands,
+                                                 encoding, rex);
+        else
+            return encode_one_register(encoder, opcode, operands, encoding,
+                                       rex);
+    case OPERAND_MEMORY:
+        return encode_one_memory(encoder, opcode, operands, encoding, rex);
+    case OPERAND_IMMEDIATE:
+        return encode_one_immediate(encoder, opcode, operands, encoding, rex);
+    }
+    // Never fall off the end of a non-void function: an operand kind outside
+    // the handled set is reported as unsupported.
+    return err_encoder_not_implemented;
+}
+
+/**
+ * Encode one instruction node and store the bytes in its value.
+ *
+ * The opcode bytes are placed in a scratch buffer, the operand encoders add to
+ * it and collect REX bits, and finally the prefixes (address size, operand
+ * size, REX, in that order) followed by the scratch buffer are written to
+ * `instruction->value.encoding`.
+ *
+ * @param encoder encoder state, forwarded to the operand encoders
+ * @param instruction node whose first child is the mnemonic and whose second
+ *        child is the operand list
+ * @return nullptr on success; the lookup or operand encoder error, or
+ *         err_encoder_unexpected_length when opcode plus operands exceed 20
+ *         bytes. The node's encoding is not modified on failure.
+ */
+error_t *encoder_encode_instruction(encoder_t *encoder,
+                                    ast_node_t *instruction) {
+    ast_node_t *operands = instruction->children[1];
+
+    opcode_data_t *opcode = nullptr;
+    error_t *err = encoder_get_opcode_data(instruction, operands, &opcode);
+    if (err)
+        return err;
+
+    uint8_t rex = 0;
+    bytes_t *encoding = LOCAL_BYTES(32);
+
+    if (opcode->opcode > 0xFF &&
+        (err = bytes_append_uint8(encoding, opcode->opcode >> 8)))
+        return err;
+    if ((err = bytes_append_uint8(encoding, opcode->opcode & 0xFF)))
+        return err;
+
+    // NOTE:operand encoders all expect the opcode to be in the buffer already.
+    // Some of them rely on this to encode the register value in the opcode
+    // byte.
+    switch (opcode->operand_count) {
+    case 0:
+        break;
+    case 1:
+        err = encode_one_operand(encoder, opcode, operands, encoding, &rex);
+        break;
+    case 2:
+        err = encode_two_operand(encoder, opcode, operands, encoding, &rex);
+        break;
+    default:
+        err = err_encoder_not_implemented;
+    }
+    if (err)
+        return err;
+
+    // Validate the length before writing anything to the node, so a failure
+    // leaves the previous contents intact. With at most three prefix bytes
+    // this bound also guarantees the 32 byte output array cannot overflow.
+    if (encoding->len > 20)
+        return err_encoder_unexpected_length;
+
+    // produce the actual encoding output in the NODE_INSTRUCTION value
+    uint8_t *output = instruction->value.encoding.encoding;
+    size_t output_len = 0;
+
+    // Handle prefixes. REX.W is merged into the bits already collected by the
+    // operand encoders instead of replacing them.
+    if (opcode->rex_w_prefix)
+        rex |= rex_prefix_w;
+    if (opcode->address_size_prefix)
+        output[output_len++] = memory_size_prefix;
+    if (opcode->operand_size_prefix)
+        output[output_len++] = operand_size_prefix;
+    if (rex > 0)
+        output[output_len++] = rex;
+
+    // copy the encoded opcode and operands
+    memcpy(output + output_len, encoding->buffer, encoding->len);
+    output_len += encoding->len;
+
+    instruction->value.encoding.len = output_len;
+
+    return nullptr;
+}
+
+/**
+ * Second pass: encode every instruction that is a direct child of `root`.
+ * Label references receive placeholder values because instruction sizes are
+ * not known yet. Stops at, and returns, the first error.
+ */
+error_t *encoder_encoding_pass(encoder_t *encoder, ast_node_t *root) {
+    for (size_t idx = 0; idx < root->len; ++idx) {
+        ast_node_t *candidate = root->children[idx];
+        if (candidate->id == NODE_INSTRUCTION) {
+            error_t *failure = encoder_encode_instruction(encoder, candidate);
+            if (failure != nullptr)
+                return failure;
+        }
+    }
+    return nullptr;
+}
+
 opcode_data_t *encoder_find_opcode(ast_node_t *instruction) {
    for (size_t i = 0; opcodes[i] != nullptr; ++i) {
+        // The first child of the instruction holds the mnemonic token and the
+        // second child is the operand list.
+        const char *mnemonic =
+            instruction->children[0]->token_entry->token.value;
+        ast_node_t *operands = instruction->children[1];
+        // Return the first table entry that matches; nullptr when none does.
+        if (is_opcode_match(opcodes[i], mnemonic, operands))
+            return opcodes[i];
    }
    return nullptr;
 }
@ -205,5 +522,5 @@ error_t *encoder_encode(encoder_t *encoder, ast_node_t *ast) {
    err = encoder_check_symbols(encoder);
    if (err)
        return err;
-    return nullptr;
+    return encoder_encoding_pass(encoder, ast);
 }
--- a/src/encoder/encoder.h
+++ b/src/encoder/encoder.h
@ -7,6 +7,15 @@ typedef struct encoder {
    symbol_table_t *symbols;
 } encoder_t;

+// Values for the mod field (bits 7-6) of a ModR/M byte.
+constexpr uint8_t modrm_mod_memory = 0b00'000'000;
+constexpr uint8_t modrm_mod_memory_displacement8 = 0b01'000'000;
+constexpr uint8_t modrm_mod_memory_displacement32 = 0b10'000'000;
+constexpr uint8_t modrm_mod_register = 0b11'000'000;
+
+// Bit masks selecting the reg (bits 5-3), rm (bits 2-0) and mod (bits 7-6)
+// fields of a ModR/M byte.
+constexpr uint8_t modrm_reg_mask = 0b00'111'000;
+constexpr uint8_t modrm_rm_mask = 0b00'000'111;
+constexpr uint8_t modrm_mod_mask = 0b11'000'000;
+
 error_t *encoder_alloc(encoder_t **encoder);
 error_t *encoder_encode(encoder_t *encoder, ast_node_t *ast);
 void encoder_free(encoder_t *encoder);
@ -16,5 +25,8 @@ extern error_t *const err_encoder_number_overflow;
 extern error_t *const err_encoder_invalid_number_format;
 extern error_t *const err_encoder_invalid_size_suffix;
 extern error_t *const err_encoder_unknown_symbol_reference;
+extern error_t *const err_encoder_no_encoding_found;
+extern error_t *const err_encoder_not_implemented;
+extern error_t *const err_encoder_unexpected_length;

 #endif // INCLUDE_ENCODER_ENCODER_H_
--- a/src/main.c
+++ b/src/main.c
@ -1,3 +1,5 @@
+#include "ast.h"
+#include "encoder/encoder.h"
 #include "error.h"
 #include "lexer.h"
 #include "parser/parser.h"
@ -8,7 +10,13 @@
 #include <stdlib.h>
 #include <string.h>

-typedef enum mode { MODE_AST, MODE_TEXT, MODE_TOKENS } mode_t;
+// Execution mode selected from the command line by get_execution_mode.
+typedef enum mode {
+    MODE_INVALID = -1, // wrong argument count or unknown mode name
+    MODE_AST,          // "ast"
+    MODE_TEXT,         // "text"
+    MODE_TOKENS,       // "tokens"
+    MODE_ENCODING,     // "encoding"
+} mode_t;

 void print_tokens(tokenlist_t *list) {
    for (auto entry = list->head; entry; entry = entry->next) {
@ -50,18 +58,61 @@ error_t *print_ast(tokenlist_t *list) {
    return nullptr;
 }

-int get_execution_mode(int argc, char *argv[]) {
-    if (argc != 3 || (strcmp(argv[1], "tokens") != 0 &&
-                      strcmp(argv[1], "text") != 0 && strcmp(argv[1], "ast"))) {
-        puts("Usage: oas [tokens|text|ast] <filename>");
-        exit(1);
+// Print `len` bytes as two-digit lowercase hex values separated by single
+// spaces, followed by a newline.
+void print_hex(size_t len, uint8_t bytes[static len]) {
+    for (size_t pos = 0; pos < len; ++pos) {
+        // no separator after the final byte
+        const char *separator = (pos + 1 < len) ? " " : "";
+        printf("%02x%s", bytes[pos], separator);
    }
+    printf("\n");
+}
+
+/**
+ * Parse the token list, run the encoder and print the encoded bytes of every
+ * top-level instruction, one instruction per line.
+ *
+ * @param list tokens to parse
+ * @return nullptr on success, otherwise the parse, allocation or encoding
+ *         error. The encoder and the AST are released on every path after
+ *         they were successfully created.
+ */
+error_t *print_encoding(tokenlist_t *list) {
+    parse_result_t result = parse(list->head);
+    if (result.err)
+        return result.err;
+
+    ast_node_t *root = result.node;
+
+    encoder_t *encoder = nullptr;
+    error_t *err = encoder_alloc(&encoder);
+    if (err)
+        goto cleanup_ast;
+
+    err = encoder_encode(encoder, root);
+    if (err)
+        goto cleanup_encoder;
+
+    for (size_t i = 0; i < root->len; ++i) {
+        ast_node_t *node = root->children[i];
+        if (node->id != NODE_INSTRUCTION)
+            continue;
+
+        print_hex(node->value.encoding.len, node->value.encoding.encoding);
+    }
+
+    // err is nullptr here; the success path shares the cleanup below
+cleanup_encoder:
+    encoder_free(encoder);
+cleanup_ast:
+    ast_node_free(root);
+    return err;
+}
+
+int get_execution_mode(int argc, char *argv[]) {
+    if (argc != 3)
+        return MODE_INVALID;

    if (strcmp(argv[1], "tokens") == 0)
        return MODE_TOKENS;
    if (strcmp(argv[1], "text") == 0)
        return MODE_TEXT;
-    return MODE_AST;
+    if (strcmp(argv[1], "ast") == 0)
+        return MODE_AST;
+    if (strcmp(argv[1], "encoding") == 0)
+        return MODE_ENCODING;
+    return MODE_INVALID;
 }

 error_t *do_action(mode_t mode, tokenlist_t *list) {
@ -74,12 +125,20 @@ error_t *do_action(mode_t mode, tokenlist_t *list) {
        return nullptr;
    case MODE_AST:
        return print_ast(list);
+    case MODE_ENCODING:
+        return print_encoding(list);
+    case MODE_INVALID:
+        /* can't happen */
    }
    __builtin_unreachable();
 }

 int main(int argc, char *argv[]) {
    mode_t mode = get_execution_mode(argc, argv);
+    if (mode == MODE_INVALID) {
+        puts("Usage: oas [tokens|text|ast|encoding] <filename>");
+        exit(1);
+    }
    char *filename = argv[2];

    lexer_t *lex = &(lexer_t){};
--- a/src/parser/combinators.c
+++ b/src/parser/combinators.c
@ -1,4 +1,5 @@
 #include "combinators.h"
+#include "util.h"

 // Parse a list of the given parser delimited by the given token id. Does not
 // store the delimiters in the parent node
@ -122,5 +123,12 @@ parse_result_t parse_consecutive(tokenlist_entry_t *current, node_id_t id,
        }
        current = result.next;
    }
+
+    // token stream ended before we matched all parsers
+    if (parser != nullptr) {
+        ast_node_free(all);
+        return parse_no_match();
+    }
+
    return parse_success(all, current);
 }
--- a/src/parser/parser.c
+++ b/src/parser/parser.c
@ -136,22 +136,28 @@ parse_result_t parse_directive_options(tokenlist_entry_t *current) {
 }

 parse_result_t parse_directive(tokenlist_entry_t *current) {
-    parser_t parsers[] = {parse_dot, parse_directive_options, nullptr};
+    parser_t parsers[] = {parse_dot, parse_directive_options, parse_newline,
+                          nullptr};
    return parse_consecutive(current, NODE_DIRECTIVE, parsers);
 }

 parse_result_t parse_instruction(tokenlist_entry_t *current) {
-    parser_t parsers[] = {parse_identifier, parse_operands, nullptr};
+    parser_t parsers[] = {parse_identifier, parse_operands, parse_newline,
+                          nullptr};
    return parse_consecutive(current, NODE_INSTRUCTION, parsers);
 }

 parse_result_t parse_statement(tokenlist_entry_t *current) {
    parser_t parsers[] = {parse_label, parse_directive, parse_instruction,
-                          nullptr};
+                          parse_newline, nullptr};
    return parse_any(current, parsers);
 }

 parse_result_t parse(tokenlist_entry_t *current) {
    current = tokenlist_skip_trivia(current);
-    return parse_many(current, NODE_PROGRAM, true, parse_statement);
+    parse_result_t result =
+        parse_many(current, NODE_PROGRAM, true, parse_statement);
+    if (result.node != nullptr)
+        ast_node_prune(result.node, NODE_NEWLINE);
+    return result;
 }
--- a/src/parser/primitives.c
+++ b/src/parser/primitives.c
@ -63,6 +63,10 @@ parse_result_t parse_dot(tokenlist_entry_t *current) {
    return parse_token(current, TOKEN_DOT, NODE_DOT, nullptr);
 }

+// Match a single TOKEN_NEWLINE and wrap it in a NODE_NEWLINE node.
+parse_result_t parse_newline(tokenlist_entry_t *current) {
+    parse_result_t result =
+        parse_token(current, TOKEN_NEWLINE, NODE_NEWLINE, nullptr);
+    return result;
+}
+
 parse_result_t parse_label_reference(tokenlist_entry_t *current) {
    return parse_token(current, TOKEN_IDENTIFIER, NODE_LABEL_REFERENCE,
                       nullptr);
--- a/src/parser/primitives.h
+++ b/src/parser/primitives.h
@ -18,6 +18,7 @@ parse_result_t parse_plus(tokenlist_entry_t *current);
 parse_result_t parse_minus(tokenlist_entry_t *current);
 parse_result_t parse_asterisk(tokenlist_entry_t *current);
 parse_result_t parse_dot(tokenlist_entry_t *current);
+parse_result_t parse_newline(tokenlist_entry_t *current);
 parse_result_t parse_label_reference(tokenlist_entry_t *current);

 /* These are "primitives" with a different name and some extra validation on top
--- a/src/tokenlist.c
+++ b/src/tokenlist.c
@ -86,7 +86,6 @@ bool is_trivia(tokenlist_entry_t *trivia) {
    switch (trivia->token.id) {
    case TOKEN_WHITESPACE:
    case TOKEN_COMMENT:
-    case TOKEN_NEWLINE:
        return true;
    default:
        return false;
--- a/tests/bytes.c
+++ b/tests/bytes.c
@ -0,0 +1,164 @@
+#include "../src/bytes.h"
+#include "munit.h"
+
+// LOCAL_BYTES must yield an empty buffer with the requested capacity and
+// zeroed storage.
+MunitResult test_bytes_initializer(const MunitParameter params[], void *data) {
+    (void)params;
+    (void)data;
+    bytes_t *fresh = LOCAL_BYTES(16);
+    munit_assert_size(fresh->len, ==, 0);
+    munit_assert_size(fresh->cap, ==, 16);
+    size_t idx = 0;
+    while (idx < 16) {
+        munit_assert_uint8(fresh->buffer[idx], ==, 0);
+        ++idx;
+    }
+    return MUNIT_OK;
+}
+
+// Appending single bytes must succeed until the buffer is full and fail with
+// err_bytes_no_capacity afterwards.
+MunitResult test_bytes_append_uint8(const MunitParameter params[], void *data) {
+    (void)params;
+    (void)data;
+    bytes_t *buf = LOCAL_BYTES(16);
+    munit_assert_size(buf->len, ==, 0);
+    munit_assert_size(buf->cap, ==, 16);
+
+    error_t *status = nullptr;
+    for (size_t pos = 0; pos < 16; ++pos) {
+        const uint8_t byte = (uint8_t)pos;
+        status = bytes_append_uint8(buf, byte);
+        munit_assert_null(status);
+        munit_assert_uint8(buf->buffer[pos], ==, byte);
+    }
+
+    // one byte more than the capacity
+    status = bytes_append_uint8(buf, 0xFF);
+    munit_assert_ptr(status, ==, err_bytes_no_capacity);
+
+    return MUNIT_OK;
+}
+
+// Appending arrays must copy the data in order, advance len, and reject an
+// array that exceeds the remaining capacity without changing len.
+MunitResult test_bytes_append_array(const MunitParameter params[], void *data) {
+    (void)params;
+    (void)data;
+
+    bytes_t *buf = LOCAL_BYTES(16);
+    munit_assert_size(buf->len, ==, 0);
+    munit_assert_size(buf->cap, ==, 16);
+
+    // first chunk lands at the start of the buffer
+    uint8_t first[] = {0x01, 0x02, 0x03, 0x04, 0x05};
+    const size_t first_len = sizeof(first) / sizeof(first[0]);
+    error_t *status = bytes_append_array(buf, first_len, first);
+    munit_assert_null(status);
+    munit_assert_size(buf->len, ==, first_len);
+    for (size_t pos = 0; pos < first_len; ++pos)
+        munit_assert_uint8(buf->buffer[pos], ==, first[pos]);
+
+    // second chunk lands directly after the first
+    uint8_t second[] = {0x06, 0x07, 0x08};
+    const size_t second_len = sizeof(second) / sizeof(second[0]);
+    status = bytes_append_array(buf, second_len, second);
+    munit_assert_null(status);
+    munit_assert_size(buf->len, ==, first_len + second_len);
+    for (size_t pos = 0; pos < second_len; ++pos)
+        munit_assert_uint8(buf->buffer[first_len + pos], ==, second[pos]);
+
+    // ten more bytes would exceed the capacity
+    uint8_t too_big[10] = {0};
+    status = bytes_append_array(buf, sizeof(too_big), too_big);
+    munit_assert_ptr(status, ==, err_bytes_no_capacity);
+    munit_assert_size(buf->len, ==, first_len + second_len);
+
+    return MUNIT_OK;
+}
+
+// Appending one bytes buffer to another must copy the used part of the source
+// and fail without side effects when it does not fit.
+MunitResult test_bytes_append_bytes(const MunitParameter params[], void *data) {
+    (void)params;
+    (void)data;
+
+    bytes_t *source = LOCAL_BYTES(8);
+    bytes_t *target = LOCAL_BYTES(16);
+    error_t *status = nullptr;
+
+    // source holds 1..5
+    for (uint8_t k = 0; k < 5; ++k) {
+        status = bytes_append_uint8(source, k + 1);
+        munit_assert_null(status);
+    }
+    munit_assert_size(source->len, ==, 5);
+
+    // first append copies those five bytes
+    status = bytes_append_bytes(target, source);
+    munit_assert_null(status);
+    munit_assert_size(target->len, ==, source->len);
+    for (size_t pos = 0; pos < source->len; ++pos)
+        munit_assert_uint8(target->buffer[pos], ==, source->buffer[pos]);
+
+    // grow source to 1..8 and append all of it again
+    for (uint8_t k = 0; k < 3; ++k) {
+        status = bytes_append_uint8(source, k + 6);
+        munit_assert_null(status);
+    }
+    munit_assert_size(source->len, ==, 8);
+
+    status = bytes_append_bytes(target, source);
+    munit_assert_null(status);
+    munit_assert_size(target->len, ==, 13); // 5 + 8
+
+    // four more bytes do not fit in the three that remain
+    source->len = 4;
+    status = bytes_append_bytes(target, source);
+    munit_assert_ptr(status, ==, err_bytes_no_capacity);
+    munit_assert_size(target->len, ==, 13); // unchanged after the error
+
+    return MUNIT_OK;
+}
+// A 16 bit value must be stored least significant byte first.
+MunitResult test_bytes_append_uint16(const MunitParameter params[], void *data) {
+    (void)params;
+    (void)data;
+    bytes_t *bytes = LOCAL_BYTES(16);
+    munit_assert_size(bytes->len, ==, 0);
+    munit_assert_size(bytes->cap, ==, 16);
+
+    error_t *err = bytes_append_uint16(bytes, 0xFFAA);
+    munit_assert_null(err);
+    munit_assert_size(bytes->len, ==, 2);
+    munit_assert_uint8(bytes->buffer[0], ==, 0xAA);
+    munit_assert_uint8(bytes->buffer[1], ==, 0xFF);
+
+    return MUNIT_OK;
+}
+// A 32 bit value must be stored least significant byte first.
+MunitResult test_bytes_append_uint32(const MunitParameter params[], void *data) {
+    (void)params;
+    (void)data;
+    bytes_t *bytes = LOCAL_BYTES(16);
+    munit_assert_size(bytes->len, ==, 0);
+    munit_assert_size(bytes->cap, ==, 16);
+
+    error_t *err = bytes_append_uint32(bytes, 0xAABBCCDD);
+    munit_assert_null(err);
+    munit_assert_size(bytes->len, ==, 4);
+    munit_assert_uint8(bytes->buffer[0], ==, 0xDD);
+    munit_assert_uint8(bytes->buffer[1], ==, 0xCC);
+    munit_assert_uint8(bytes->buffer[2], ==, 0xBB);
+    munit_assert_uint8(bytes->buffer[3], ==, 0xAA);
+    return MUNIT_OK;
+}
+// A 64 bit value must be stored least significant byte first.
+MunitResult test_bytes_append_uint64(const MunitParameter params[], void *data) {
+    (void)params;
+    (void)data;
+    bytes_t *bytes = LOCAL_BYTES(16);
+    munit_assert_size(bytes->len, ==, 0);
+    munit_assert_size(bytes->cap, ==, 16);
+
+    error_t *err = bytes_append_uint64(bytes, 0xAABBCCDDEEFF9988);
+    munit_assert_null(err);
+    munit_assert_size(bytes->len, ==, 8);
+    munit_assert_uint8(bytes->buffer[0], ==, 0x88);
+    munit_assert_uint8(bytes->buffer[1], ==, 0x99);
+    munit_assert_uint8(bytes->buffer[2], ==, 0xFF);
+    munit_assert_uint8(bytes->buffer[3], ==, 0xEE);
+    munit_assert_uint8(bytes->buffer[4], ==, 0xDD);
+    munit_assert_uint8(bytes->buffer[5], ==, 0xCC);
+    munit_assert_uint8(bytes->buffer[6], ==, 0xBB);
+    munit_assert_uint8(bytes->buffer[7], ==, 0xAA);
+    return MUNIT_OK;
+}
+
+// Test table for the bytes helpers; the all-null entry terminates the list.
+MunitTest bytes_tests[] = {
+    {"/initializer",   test_bytes_initializer,   nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr},
+    {"/append_uint8",  test_bytes_append_uint8,  nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr},
+    {"/append_array",  test_bytes_append_array,  nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr},
+    {"/append_bytes",  test_bytes_append_bytes,  nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr},
+    {"/append_uint16", test_bytes_append_uint16, nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr},
+    {"/append_uint32", test_bytes_append_uint32, nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr},
+    {"/append_uint64", test_bytes_append_uint64, nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr},
+    {nullptr,          nullptr,                  nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr}
+};
--- a/tests/input/regression/test_no_operands_eof.asm
+++ b/tests/input/regression/test_no_operands_eof.asm
@ -0,0 +1,5 @@
+; regression test for two issues:
+;  - parsing two zero operand instructions in a row
+;  - a zero operand instruction just before eof
+    syscall
+    ret
--- a/tests/main.c
+++ b/tests/main.c
@ -4,6 +4,7 @@ extern MunitTest ast_tests[];
 extern MunitTest lexer_tests[];
 extern MunitTest regression_tests[];
 extern MunitTest symbols_tests[];
+extern MunitTest bytes_tests[];

 int main(int argc, char *argv[MUNIT_ARRAY_PARAM(argc + 1)]) {
    MunitSuite suites[] = {
@ -11,6 +12,7 @@ int main(int argc, char *argv[MUNIT_ARRAY_PARAM(argc + 1)]) {
        {"/ast",        ast_tests,        nullptr, 1, MUNIT_SUITE_OPTION_NONE},
        {"/lexer",      lexer_tests,      nullptr, 1, MUNIT_SUITE_OPTION_NONE},
        {"/symbols",    symbols_tests,    nullptr, 1, MUNIT_SUITE_OPTION_NONE},
+        {"/bytes",      bytes_tests,      nullptr, 1, MUNIT_SUITE_OPTION_NONE},
        {nullptr,       nullptr,          nullptr, 0, MUNIT_SUITE_OPTION_NONE},
    };

--- a/tests/regression.c
+++ b/tests/regression.c
@ -23,9 +23,46 @@ MunitResult test_regression_trivia_head(const MunitParameter params[], void *dat

    ast_node_free(result.node);
    tokenlist_free(list);
+    return MUNIT_OK;
+}
+
+// Regression test: two consecutive zero-operand instructions, the last one
+// directly before end of file, must parse into two NODE_INSTRUCTION nodes
+// whose operands child is empty.
+MunitResult test_no_operands_eof(const MunitParameter params[], void *data) {
+    (void)params;
+    (void)data;
+
+    // NOTE(review): lex is opened but never closed in this test; confirm
+    // whether the lexer needs an explicit close to avoid leaking the file.
+    lexer_t *lex = &(lexer_t){};
+    error_t *err = lexer_open(lex, "tests/input/regression/test_no_operands_eof.asm");
+    munit_assert_null(err);
+
+    tokenlist_t *list;
+    err = tokenlist_alloc(&list);
+    munit_assert_null(err);
+
+    err = tokenlist_fill(list, lex);
+    munit_assert_null(err);
+
+    parse_result_t result = parse(list->head);
+    munit_assert_null(result.err);
+    munit_assert_null(result.next);
+    // Fail the test cleanly instead of crashing if no tree was produced
+    munit_assert_not_null(result.node);
+
+    // Both children should be instructions
+    munit_assert_size(result.node->len, ==, 2);
+    munit_assert_int(result.node->children[0]->id, ==, NODE_INSTRUCTION);
+    munit_assert_int(result.node->children[1]->id, ==, NODE_INSTRUCTION);
+
+    // Each instruction has two children, and the second one (the operands)
+    // should be empty
+    munit_assert_size(result.node->children[0]->len, ==, 2);
+    munit_assert_size(result.node->children[1]->len, ==, 2);
+    munit_assert_size(result.node->children[0]->children[1]->len, ==, 0);
+    munit_assert_size(result.node->children[1]->children[1]->len, ==, 0);
+
+    ast_node_free(result.node);
+    tokenlist_free(list);
+    return MUNIT_OK;
 }

 MunitTest regression_tests[] = {
-    {"/trivia_head", test_regression_trivia_head, nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr},
-    {nullptr,        nullptr,                     nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr}
+    {"/trivia_head",     test_regression_trivia_head, nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr},
+    {"/no_operands_eof", test_no_operands_eof,        nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr},
+    {nullptr,            nullptr,                     nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr}
 };
Author	SHA1	Message	Date
omicron	d97cfb97be	Implement printing the encoding in main All checks were successful Validate the build / validate-build (push) Successful in 33s Details	2025-04-16 23:10:17 +02:00
omicron	99c9dcd985	Incomplete second pass encoding	2025-04-16 23:10:09 +02:00
omicron	7e9c1bfda2	Add bytes type and tests bytes_t is a local (automatic) allocation array that carries the length and capacity with it.	2025-04-16 23:10:09 +02:00
omicron	d8ae126e9a	Add opcode encoding value for NODE_INSTRUCTION entries in the AST	2025-04-16 23:10:09 +02:00
omicron	68dcd9dcce	Add first encoding pass First pass collects all the symbols and interprets number and register tokens into usable data for the later passes.	2025-04-16 23:10:00 +02:00
omicron	dcf90b72e0	Add register and number values to AST nodes	2025-04-16 23:10:00 +02:00
omicron	2cf69f5e18	Add initial limited opcode data	2025-04-16 23:09:47 +02:00
omicron	d59559d327	Add registers data table Change the validated primitive parse_register so that it uses the data table instead	2025-04-16 13:46:19 +02:00
omicron	ac14925a0a	Add symbols tests	2025-04-16 13:46:19 +02:00
omicron	2a7bb479ac	initial symbol table implementation	2025-04-16 13:46:19 +02:00
omicron	ef22c0b620	Add .import and .export to the input test file	2025-04-16 13:46:19 +02:00
omicron	8c0e9926c5	Make main properly return with failure on parsing errors	2025-04-16 13:46:19 +02:00
omicron	d3d69b82d5	Add .import and .export directive to the grammar and parser	2025-04-16 13:46:10 +02:00
omicron	dc210e409c	fix parse_immediate to accept label_reference instead of identifier	2025-04-16 13:41:28 +02:00
omicron	00272d69bf	Add regression test for parse zero operands at eof All checks were successful Validate the build / validate-build (push) Successful in 30s Details	2025-04-16 13:16:55 +02:00
omicron	2385d38608	Prune the parse tree of NODE_NEWLINE after parsing succeeds	2025-04-16 13:01:02 +02:00
omicron	242fd9baa5	Fix grammar not being able to disambiguate some instructions When two identifiers follow each other it could be two instruction mnemonics or one instruction mnemonic and one operand. To fix this TOKEN_NEWLINE has been reintroduced as a semantic token. The grammar has been changed to allow empty statements and every instruction and directive has to end in a newline. Labels do not have to end in a newline. In addition to updating the grammar, the implementation of tokenlist, ast and parser has been updated to reflect these changes.	2025-04-16 12:34:44 +02:00
omicron	1574ec6249	Fix parse_consecutive behavior when the token stream runs out	2025-04-16 12:13:02 +02:00