From 99c9dcd98578c98b94028f40a10a41fc3ec0828f Mon Sep 17 00:00:00 2001
From: omicron
Date: Wed, 16 Apr 2025 00:21:41 +0200
Subject: [PATCH] Incomplete second pass encoding

---
Review notes:
 - A register encoded in the opcode byte now sets REX.B instead of REX.R.
 - REX.W is OR-ed into the REX bits set by the operand encoders instead of
   replacing them.
 - encode_one_operand returns an error instead of falling off the end of a
   non-void function.
 - The blob ids on the "index" lines predate these changes.
 - NOTE(review): the last context line of the first encoder.c hunk is a bare
   "#include"; its header name is missing and has to be restored before this
   patch can apply.

 src/encoder/encoder.c | 366 +++++++++++++++++++++++++++++++++++++++++-
 src/encoder/encoder.h |  12 ++
 2 files changed, 377 insertions(+), 1 deletion(-)

diff --git a/src/encoder/encoder.c b/src/encoder/encoder.c
index a43a23b..d3b999a 100644
--- a/src/encoder/encoder.c
+++ b/src/encoder/encoder.c
@@ -1,4 +1,5 @@
 #include "encoder.h"
+#include "../bytes.h"
 #include "../data/opcodes.h"
 #include "symbols.h"
 #include
@@ -15,6 +16,12 @@ error_t *const err_encoder_invalid_size_suffix =
     &(error_t){.message = "Invalid number size suffix"};
 error_t *const err_encoder_unknown_symbol_reference =
     &(error_t){.message = "Referenced an unknown symbol"};
+error_t *const err_encoder_no_encoding_found =
+    &(error_t){.message = "No encoding found for instruction"};
+error_t *const err_encoder_not_implemented =
+    &(error_t){.message = "Implementation for this opcode is missing"};
+error_t *const err_encoder_unexpected_length =
+    &(error_t){.message = "Unexpectedly long encoding"};
 
 error_t *encoder_alloc(encoder_t **output) {
     *output = nullptr;
@@ -162,6 +169,50 @@ error_t *encoder_set_register_value(ast_node_t *node) {
     return err_encoder_invalid_register;
 }
 
+/**
+ * Set the opcode extension in the modrm field
+ */
+static inline uint8_t modrm_extension(uint8_t modrm, uint8_t extension) {
+    assert(extension != opcode_extension_none);
+    assert((extension & 0b111) == extension);
+    return (modrm & ~modrm_reg_mask) | extension << 3;
+}
+
+/**
+ * Return the rex bit for reg field in modrm
+ */
+static inline uint8_t modrm_reg_rex(uint8_t rex, register_id_t id) {
+    if (id & 0b1000)
+        rex |= rex_prefix_r;
+    return rex;
+}
+
+/**
+ * update modrm reg field with the given register, must be used alongside
+ * modrm_reg_rex
+ */
+static inline uint8_t modrm_reg(uint8_t modrm, register_id_t id) {
+    return (modrm & ~modrm_reg_mask) | (id & 0b111) << 3;
+}
+
+/**
+ * Return the rex bit for rm field in modrm
+ */
+static inline uint8_t modrm_rm_rex(uint8_t rex, register_id_t id) {
+    if (id & 0b1000)
+        rex |= rex_prefix_b;
+    return rex;
+}
+
+/**
+ * update modrm rm field with the given register, must be used alongside
+ * modrm_rm_rex
+ */
+static inline uint8_t modrm_rm(uint8_t modrm, register_id_t id) {
+    assert((modrm & modrm_mod_mask) == modrm_mod_register);
+    return (modrm & ~modrm_rm_mask) | (id & 0b111);
+}
+
 /**
  * Perform the initial pass over the AST. Records all symbols and sets the
  * values of registers and numbers.
@@ -187,8 +238,321 @@ error_t *encoder_first_pass(encoder_t *encoder, ast_node_t *node) {
     return nullptr;
 }
 
+/**
+ * Check whether an AST operand satisfies an opcode's operand description.
+ * Registers must match in size, memory operands match on kind alone, numeric
+ * immediates must share a size bit with the description and label references
+ * are only accepted for 32 bit immediates.
+ */
+bool is_operand_match(operand_info_t *info, ast_node_t *operand) {
+    switch (info->kind) {
+    case OPERAND_REGISTER:
+        return operand->id == NODE_REGISTER &&
+               operand->value.reg.size == info->size;
+    case OPERAND_MEMORY:
+        return operand->id == NODE_MEMORY;
+    case OPERAND_IMMEDIATE: {
+        if (operand->id != NODE_IMMEDIATE)
+            return false;
+        ast_node_t *child = operand->children[0];
+
+        if (child->id == NODE_NUMBER)
+            return (child->value.number.size & info->size) > 0;
+        else if (child->id == NODE_LABEL_REFERENCE)
+            return info->size == OPERAND_SIZE_32;
+        // FIXME: first pass should give us information about the distance of
+        // the label reference so we can pick a size more appropriately instead
+        // of just defaulting to 32 bits
+        break;
+    } // end OPERAND_IMMEDIATE case
+    }
+    assert(false && "unreachable");
+    __builtin_unreachable();
+}
+
+/**
+ * Check whether an opcode table entry has the given mnemonic and accepts the
+ * given operand list (same operand count, every operand matching).
+ */
+bool is_opcode_match(opcode_data_t *opcode, const char *mnemonic,
+                     ast_node_t *operands) {
+    if (strcmp(opcode->mnemonic, mnemonic) != 0)
+        return false;
+
+    if (opcode->operand_count != operands->len)
+        return false;
+
+    for (size_t i = 0; i < operands->len; ++i) {
+        if (!is_operand_match(&opcode->operands[i], operands->children[i]))
+            return false;
+    }
+
+    return true;
+}
+
+/**
+ * Find the first entry in the opcode table that matches the instruction and
+ * store it in opcode_out. Returns err_encoder_no_encoding_found, leaving
+ * opcode_out untouched, when nothing matches.
+ */
+error_t *encoder_get_opcode_data(ast_node_t *instruction, ast_node_t *operands,
+                                 opcode_data_t **opcode_out) {
+    const char *mnemonic = instruction->children[0]->token_entry->token.value;
+
+    for (size_t i = 0; opcodes[i]; ++i) {
+        opcode_data_t *opcode = opcodes[i];
+        if (is_opcode_match(opcode, mnemonic, operands)) {
+            *opcode_out = opcode;
+            return nullptr;
+        }
+    }
+    return err_encoder_no_encoding_found;
+}
+
+/**
+ * Encode the operands of a two operand instruction. Not implemented yet.
+ */
+error_t *encode_two_operand(encoder_t *encoder, opcode_data_t *opcode,
+                            ast_node_t *operands, bytes_t *encoding,
+                            uint8_t *rex) {
+    (void)encoder;
+    (void)opcode;
+    (void)operands;
+    (void)encoding;
+    (void)rex;
+    assert(encoding->len >= 1 && "must have 1+ opcode byte in buffer already");
+    return err_encoder_not_implemented;
+}
+
+/**
+ * Encode a single register operand in the low three bits of the last opcode
+ * byte. The fourth register bit is carried by REX.B, which is the REX bit that
+ * extends the opcode reg field.
+ */
+error_t *encode_one_register_in_opcode(encoder_t *encoder,
+                                       opcode_data_t *opcode,
+                                       ast_node_t *operands, bytes_t *encoding,
+                                       uint8_t *rex) {
+    (void)encoder;
+    (void)opcode;
+    assert(operands->len == 1);
+    assert(operands->children[0]->id == NODE_REGISTER);
+    assert(encoding->len >= 1 && "must have 1+ opcode byte in buffer already");
+
+    register_id_t id = operands->children[0]->value.reg.id;
+    encoding->buffer[encoding->len - 1] |= id & 0b111;
+    if ((id & 0b1000) > 0) {
+        *rex |= rex_prefix_b;
+    }
+    return nullptr;
+}
+
+/**
+ * Encode a single register operand in a register-direct ModR/M byte and append
+ * that byte to the encoding.
+ */
+error_t *encode_one_register(encoder_t *encoder, opcode_data_t *opcode,
+                             ast_node_t *operands, bytes_t *encoding,
+                             uint8_t *rex) {
+    (void)encoder;
+    assert(operands->len == 1);
+    assert(operands->children[0]->id == NODE_REGISTER);
+
+    register_id_t id = operands->children[0]->value.reg.id;
+
+    uint8_t modrm = modrm_mod_register;
+
+    if (opcode->opcode_extension != opcode_extension_none) {
+        // register goes in rm field, extension goes in reg field
+        modrm = modrm_extension(modrm, opcode->opcode_extension);
+        modrm = modrm_rm(modrm, id);
+        *rex = modrm_rm_rex(*rex, id);
+    } else {
+        // register goes in reg field
+        // NOTE:
+        // it's actually likely this case just doesn't exist at all and all
+        // opcodes that take one register in ModR/M _all_ have extended opcodes
+        modrm = modrm_reg(modrm, id);
+        *rex = modrm_reg_rex(*rex, id);
+    }
+
+    return bytes_append_uint8(encoding, modrm);
+}
+
+/**
+ * Append a single immediate operand to the encoding. Numbers are written with
+ * the size given by the opcode's operand description; label references get a
+ * 32 bit placeholder.
+ */
+error_t *encode_one_immediate(encoder_t *encoder, opcode_data_t *opcode,
+                              ast_node_t *operands, bytes_t *encoding,
+                              uint8_t *rex) {
+    (void)encoder;
+    (void)opcode;
+    (void)rex;
+    assert(operands->len == 1);
+    assert(operands->children[0]->id == NODE_IMMEDIATE);
+    assert(operands->children[0]->len == 1);
+    ast_node_t *immediate = operands->children[0]->children[0];
+    assert(immediate->id == NODE_NUMBER ||
+           immediate->id == NODE_LABEL_REFERENCE);
+
+    if (immediate->id == NODE_NUMBER) {
+        uint64_t value = immediate->value.number.value;
+        operand_size_t size = opcode->operands[0].size;
+        error_t *err = nullptr;
+        switch (size) {
+        case OPERAND_SIZE_8:
+            err = bytes_append_uint8(encoding, value);
+            break;
+        case OPERAND_SIZE_16:
+            err = bytes_append_uint16(encoding, value);
+            break;
+        case OPERAND_SIZE_32:
+            err = bytes_append_uint32(encoding, value);
+            break;
+        case OPERAND_SIZE_64:
+            err = bytes_append_uint64(encoding, value);
+            break;
+        default:
+            assert(false && "intentionally unhandled");
+        }
+        return err;
+    } else {
+        // FIXME: this still assumes references are always 32 bit
+        uint32_t value = 0xDEADBEEF;
+        return bytes_append_uint32(encoding, value);
+    }
+}
+
+/**
+ * Encode a single memory operand. Not implemented yet.
+ */
+error_t *encode_one_memory(encoder_t *encoder, opcode_data_t *opcode,
+                           ast_node_t *operands, bytes_t *encoding,
+                           uint8_t *rex) {
+    (void)encoder;
+    (void)opcode;
+    (void)operands;
+    (void)encoding;
+    (void)rex;
+    return err_encoder_not_implemented;
+}
+
+/**
+ * Encode the operand of a one operand instruction by dispatching on the
+ * operand kind from the opcode's operand description.
+ */
+error_t *encode_one_operand(encoder_t *encoder, opcode_data_t *opcode,
+                            ast_node_t *operands, bytes_t *encoding,
+                            uint8_t *rex) {
+    switch (opcode->operands[0].kind) {
+    case OPERAND_REGISTER:
+        if (opcode->encoding_class == ENCODING_OPCODE_REGISTER)
+            return encode_one_register_in_opcode(encoder, opcode, operands,
+                                                 encoding, rex);
+        else
+            return encode_one_register(encoder, opcode, operands, encoding,
+                                       rex);
+    case OPERAND_MEMORY:
+        return encode_one_memory(encoder, opcode, operands, encoding, rex);
+    case OPERAND_IMMEDIATE:
+        return encode_one_immediate(encoder, opcode, operands, encoding, rex);
+    }
+    // unknown operand kind: fail instead of falling off the end
+    return err_encoder_not_implemented;
+}
+
+/**
+ * Encode one instruction: look up its opcode, encode opcode bytes and
+ * operands, then write the prefixes followed by that encoding into the
+ * instruction node's encoding value.
+ */
+error_t *encoder_encode_instruction(encoder_t *encoder,
+                                    ast_node_t *instruction) {
+    ast_node_t *operands = instruction->children[1];
+
+    opcode_data_t *opcode = nullptr;
+    error_t *err = encoder_get_opcode_data(instruction, operands, &opcode);
+    if (err)
+        return err;
+
+    uint8_t rex = 0;
+    bytes_t *encoding = LOCAL_BYTES(32);
+
+    if (opcode->opcode > 0xFF &&
+        (err = bytes_append_uint8(encoding, opcode->opcode >> 8)))
+        return err;
+    if ((err = bytes_append_uint8(encoding, opcode->opcode & 0xFF)))
+        return err;
+
+    // NOTE:operand encoders all expect the opcode to be in the buffer already.
+    // Some of them rely on this to encode the register value in the opcode
+    // byte.
+    switch (opcode->operand_count) {
+    case 0:
+        break;
+    case 1:
+        err = encode_one_operand(encoder, opcode, operands, encoding, &rex);
+        break;
+    case 2:
+        err = encode_two_operand(encoder, opcode, operands, encoding, &rex);
+        break;
+    default:
+        err = err_encoder_not_implemented;
+    }
+    if (err)
+        return err;
+
+    // produce the actual encoding output in the NODE_INSTRUCTION value
+    uint8_t *output = instruction->value.encoding.encoding;
+    size_t output_len = 0;
+
+    // Handle prefixes
+    if (opcode->rex_w_prefix)
+        rex |= rex_prefix_w;
+    if (opcode->address_size_prefix)
+        output[output_len++] = memory_size_prefix;
+    if (opcode->operand_size_prefix)
+        output[output_len++] = operand_size_prefix;
+    if (rex > 0)
+        output[output_len++] = rex;
+
+    // copy the encoded opcode and operands
+    if (encoding->len > 20)
+        return err_encoder_unexpected_length;
+    memcpy(output + output_len, encoding->buffer, encoding->len);
+    output_len += encoding->len;
+
+    instruction->value.encoding.len = output_len;
+
+    return nullptr;
+}
+
+/**
+ * Perform the second pass that performs actual encoding. Will use
+ * placeholder values for label references because instruction size has not
+ * yet been determined.
+ */
+error_t *encoder_encoding_pass(encoder_t *encoder, ast_node_t *root) {
+    for (size_t i = 0; i < root->len; ++i) {
+        if (root->children[i]->id != NODE_INSTRUCTION)
+            continue;
+        ast_node_t *instruction = root->children[i];
+        error_t *err = encoder_encode_instruction(encoder, instruction);
+        if (err)
+            return err;
+    }
+    return nullptr;
+}
+
 opcode_data_t *encoder_find_opcode(ast_node_t *instruction) {
+    const char *mnemonic = instruction->children[0]->token_entry->token.value;
+    ast_node_t *operands = instruction->children[1];
+
     for (size_t i = 0; opcodes[i] != nullptr; ++i) {
+        if (is_opcode_match(opcodes[i], mnemonic, operands))
+            return opcodes[i];
     }
     return nullptr;
 }
@@ -207,5 +571,5 @@ error_t *encoder_encode(encoder_t *encoder, ast_node_t *ast) {
     err = encoder_check_symbols(encoder);
     if (err)
         return err;
-    return nullptr;
+    return encoder_encoding_pass(encoder, ast);
 }
diff --git a/src/encoder/encoder.h b/src/encoder/encoder.h
index f10966a..45d34d7 100644
--- a/src/encoder/encoder.h
+++ b/src/encoder/encoder.h
@@ -7,6 +7,15 @@ typedef struct encoder {
     symbol_table_t *symbols;
 } encoder_t;
 
+constexpr uint8_t modrm_mod_memory = 0b00'000'000;
+constexpr uint8_t modrm_mod_memory_displacement8 = 0b01'000'000;
+constexpr uint8_t modrm_mod_memory_displacement32 = 0b10'000'000;
+constexpr uint8_t modrm_mod_register = 0b11'000'000;
+
+constexpr uint8_t modrm_reg_mask = 0b00'111'000;
+constexpr uint8_t modrm_rm_mask = 0b00'000'111;
+constexpr uint8_t modrm_mod_mask = 0b11'000'000;
+
 error_t *encoder_alloc(encoder_t **encoder);
 error_t *encoder_encode(encoder_t *encoder, ast_node_t *ast);
 void encoder_free(encoder_t *encoder);
@@ -16,5 +25,8 @@ extern error_t *const err_encoder_number_overflow;
 extern error_t *const err_encoder_invalid_number_format;
 extern error_t *const err_encoder_invalid_size_suffix;
 extern error_t *const err_encoder_unknown_symbol_reference;
+extern error_t *const err_encoder_no_encoding_found;
+extern error_t *const err_encoder_not_implemented;
+extern error_t *const err_encoder_unexpected_length;
 
 #endif // INCLUDE_ENCODER_ENCODER_H_