From fab5bedf3dc359f8a3f94f3e87683f7183caad59 Mon Sep 17 00:00:00 2001
From: omicron
Date: Tue, 22 Apr 2025 02:08:38 +0200
Subject: [PATCH] Implement two pass encoding

First pass:
- collect information for numbers, registers and which instructions contain
  label references
- encode all instructions that don't contain label references
- Set (temporary) addresses for each instruction

Second pass:
- Collect information about label references (address, offset, size)
- encode all instructions that contain label references
- Update (if necessary) addresses for each instruction

The second pass is iterated 10 times or until no instructions change size,
whichever comes first.
---
 src/encoder/encoder.c | 221 ++++++++++++++++++++++++++++++++++--------
 1 file changed, 185 insertions(+), 36 deletions(-)

diff --git a/src/encoder/encoder.c b/src/encoder/encoder.c
index d81a084..544f711 100644
--- a/src/encoder/encoder.c
+++ b/src/encoder/encoder.c
@@ -6,6 +6,31 @@
 #include
 #include
 
+/**
+ * General encoder flow:
+ *
+ * There are 2 major passes the encoder does:
+ *
+ * First pass:
+ * - Run through the AST and collect information:
+ *   - Set register values
+ *   - Parse/set number values
+ *   - Mark all instructions that use label references
+ * - Encode all instructions that don't use label references
+ * - Update addresses of all labels and instructions. Use an estimated
+ *   instruction size for those instructions that use label references.
+ *
+ * Second pass:
+ * - Run through the AST for all instructions that use label references and
+ *   collect size information using the estimated addresses from pass 1
+ * - Encode label references with the estimated addresses, this fixes their
+ *   size.
+ * - Update all addresses
+ *
+ * Iteration:
+ * - Repeat the second pass until addresses converge
+ */
+
 error_t *const err_encoder_invalid_register =
     &(error_t){.message = "Invalid register"};
 error_t *const err_encoder_number_overflow =
@@ -219,9 +244,11 @@ error_t *encoder_collect_info(encoder_t *encoder, ast_node_t *node,
                               ast_node_t *statement) {
     error_t *err = nullptr;
 
-    if (encoder_is_symbols_node(node))
+    if (encoder_is_symbols_node(node)) {
         err = symbol_table_update(encoder->symbols, node, statement);
-    else if (node->id == NODE_NUMBER)
+        if (statement->id == NODE_INSTRUCTION)
+            statement->value.instruction.has_reference = true;
+    } else if (node->id == NODE_NUMBER)
         err = encoder_set_number_value(node);
     else if (node->id == NODE_REGISTER)
         err = encoder_set_register_value(node);
@@ -238,31 +265,6 @@ error_t *encoder_collect_info(encoder_t *encoder, ast_node_t *node,
     return nullptr;
 }
 
-/**
- * Perform the initial pass over the AST.
- *
- * - Collect information about the operands
- *   - parse and set number values
- *   - set the register values
- *   - determine if label references are used by an instruction
- * - encode instructions that don't use label references
- * - determine estimated addresses of each statement
- *
- */
-error_t *encoder_first_pass(encoder_t *encoder) {
-    ast_node_t *root = encoder->ast;
-    assert(root->id == NODE_PROGRAM);
-
-    for (size_t i = 0; i < root->len; ++i) {
-        ast_node_t *statement = root->children[i];
-        error_t *err = encoder_collect_info(encoder, statement, statement);
-        if (err)
-            return err;
-    }
-
-    return nullptr;
-}
-
 bool is_operand_match(operand_info_t *info, ast_node_t *operand) {
     switch (info->kind) {
     case OPERAND_REGISTER:
@@ -506,20 +508,160 @@ error_t *encoder_encode_instruction(encoder_t *encoder,
 }
 
 /**
- * Perform the second pass that performs actual encoding. Will use
- * placeholder values for label references because instruction size has not
- * yet been determined.
+ * Perform the initial pass over the AST.
+ *
+ * - Collect information about the operands
+ *   - parse and set number values
+ *   - set the register values
+ *   - determine if label references are used by an instruction
+ * - encode instructions that don't use label references
+ * - determine estimated addresses of each statement
+ *
  */
-error_t *encoder_second_pass(encoder_t *encoder) {
+constexpr size_t instruction_size_estimate = 10;
+error_t *encoder_first_pass(encoder_t *encoder) {
     ast_node_t *root = encoder->ast;
+    assert(root->id == NODE_PROGRAM);
+
+    uintptr_t address = 0;
 
     for (size_t i = 0; i < root->len; ++i) {
-        if (root->children[i]->id != NODE_INSTRUCTION)
-            continue;
-        ast_node_t *instruction = root->children[i];
-        error_t *err = encoder_encode_instruction(encoder, instruction);
+        ast_node_t *statement = root->children[i];
+        error_t *err = encoder_collect_info(encoder, statement, statement);
         if (err)
             return err;
+
+        if (statement->id == NODE_INSTRUCTION &&
+            statement->value.instruction.has_reference == false) {
+            err = encoder_encode_instruction(encoder, statement);
+            if (err)
+                return err;
+            statement->value.instruction.address = address;
+            address += statement->value.instruction.encoding.len;
+        } else if (statement->id == NODE_INSTRUCTION) {
+            statement->value.instruction.encoding.len =
+                instruction_size_estimate;
+            statement->value.instruction.address = address;
+            address += instruction_size_estimate;
+        } else if (statement->id == NODE_LABEL) {
+            statement->value.label.address = address;
+        }
+    }
+
+    return nullptr;
+}
+
+/**
+ * Build the mask of operand sizes able to hold `value` as a signed
+ * integer. OPERAND_SIZE_64 is always part of the mask.
+ */
+operand_size_t signed_to_size_mask(int64_t value) {
+    operand_size_t size = OPERAND_SIZE_64;
+
+    if (value >= INT8_MIN && value <= INT8_MAX)
+        size |= OPERAND_SIZE_8;
+
+    if (value >= INT16_MIN && value <= INT16_MAX)
+        size |= OPERAND_SIZE_16;
+
+    if (value >= INT32_MIN && value <= INT32_MAX)
+        size |= OPERAND_SIZE_32;
+
+    return size;
+}
+
+/**
+ * Signed distance in bytes from the end of `from` to the start of `to`.
+ *
+ * Labels keep their address in value.label and occupy no bytes;
+ * instructions end at their address plus their encoded length.
+ */
+int64_t statement_offset(ast_node_t *from, ast_node_t *to) {
+    assert(from->id == NODE_LABEL || from->id == NODE_INSTRUCTION);
+    assert(to->id == NODE_LABEL || to->id == NODE_INSTRUCTION);
+
+    int64_t from_addr = from->id == NODE_LABEL
+                            ? from->value.label.address
+                            : from->value.instruction.address +
+                                  from->value.instruction.encoding.len;
+
+    int64_t to_addr = to->id == NODE_LABEL
+                          ? to->value.label.address
+                          : to->value.instruction.address;
+
+    return to_addr - from_addr;
+}
+
+/**
+ * Record address, offset and size information on every label reference
+ * found in `node` or below it. Offsets are relative to the end of
+ * `statement`, the instruction that owns the reference.
+ */
+error_t *encoder_collect_label_info(encoder_t *encoder, ast_node_t *node,
+                                    ast_node_t *statement) {
+    assert(statement->id == NODE_INSTRUCTION);
+    if (node->id == NODE_LABEL_REFERENCE) {
+        const char *name = node->token_entry->token.value;
+        symbol_t *symbol = symbol_table_lookup(encoder->symbols, name);
+        assert(symbol && symbol->statement &&
+               symbol->statement->id == NODE_LABEL);
+
+        int64_t offset = statement_offset(statement, symbol->statement);
+        int64_t absolute = symbol->statement->value.label.address;
+        operand_size_t size = signed_to_size_mask(offset);
+
+        node->value.reference.address = absolute;
+        node->value.reference.offset = offset;
+        node->value.reference.size = size;
+    }
+
+    // The caller passes the instruction itself, so the references
+    // can only be reached through its children.
+    for (size_t i = 0; i < node->len; ++i) {
+        error_t *err =
+            encoder_collect_label_info(encoder, node->children[i], statement);
+        if (err)
+            return err;
+    }
+
+    return nullptr;
+}
+
+/**
+ * Perform the second pass. Updates the label info and encodes all instructions
+ * that have a label reference. Instructions without a reference keep the
+ * encoding from the first pass and only get a new address.
+ *
+ * Sets *did_update when a re-encoded instruction changed size.
+ */
+error_t *encoder_second_pass(encoder_t *encoder, bool *did_update) {
+    ast_node_t *root = encoder->ast;
+
+    *did_update = false;
+    int64_t address = 0;
+    for (size_t i = 0; i < root->len; ++i) {
+        ast_node_t *statement = root->children[i];
+
+        if (statement->id == NODE_INSTRUCTION &&
+            statement->value.instruction.has_reference) {
+            statement->value.instruction.address = address;
+            size_t before = statement->value.instruction.encoding.len;
+            error_t *err =
+                encoder_collect_label_info(encoder, statement, statement);
+            if (err)
+                return err;
+            err = encoder_encode_instruction(encoder, statement);
+            if (err)
+                return err;
+            size_t after = statement->value.instruction.encoding.len;
+            address += after;
+            *did_update = *did_update || (before != after);
+        } else if (statement->id == NODE_INSTRUCTION) {
+            statement->value.instruction.address = address;
+            address += statement->value.instruction.encoding.len;
+        } else if (statement->id == NODE_LABEL) {
+            statement->value.label.address = address;
+        }
     }
     return nullptr;
 }
@@ -549,5 +691,12 @@ error_t *encoder_encode(encoder_t *encoder) {
     err = encoder_check_symbols(encoder);
     if (err)
         return err;
-    return encoder_second_pass(encoder);
+
+    bool did_update = true;
+    for (int i = 0; i < 10 && did_update; ++i) {
+        err = encoder_second_pass(encoder, &did_update);
+        if (err)
+            return err;
+    }
+    return nullptr;
 }