Compare commits

...

2 Commits

Author SHA1 Message Date
7cefc3564d Implement one immediate label reference operand
All checks were successful
Validate the build / validate-build (push) Successful in 43s
Also adds opcode data for jmp and call
2025-04-24 14:45:57 +02:00
c848995ad6 Implement two pass encoding
First pass:
 - collect information for numbers, registers and which instructions
   contain label references
 - encode all instructions that don't contain label references
 - Set (temporary) addresses for each instruction

Second pass:
 - Collect information about label references (address, offset, size)
 - encode all instructions that contain label references
 - Update (if necessary) addresses for each instruction

 The second pass is iterated 10 times or until no instructions change
 size, whichever comes first.
2025-04-24 14:45:46 +02:00
2 changed files with 331 additions and 53 deletions

View File

@ -138,8 +138,128 @@ opcode_data_t *const opcodes[] = {
{ .kind = OPERAND_REGISTER, .size = OPERAND_SIZE_64 }, { .kind = OPERAND_REGISTER, .size = OPERAND_SIZE_64 },
}, },
}, },
// CALL rel32
&(opcode_data_t) {
.mnemonic = "call",
.opcode = 0xE8,
.opcode_extension = opcode_extension_none,
.encoding_class = ENCODING_DEFAULT,
.operand_count = 1,
.operands = {
{ .kind = OPERAND_IMMEDIATE, .size = OPERAND_SIZE_32 },
},
},
// CALL reg64
&(opcode_data_t) {
.mnemonic = "call",
.opcode = 0xFF,
.opcode_extension = 2,
.encoding_class = ENCODING_DEFAULT,
.rex_w_prefix = true,
.operand_count = 1,
.operands = {
{ .kind = OPERAND_REGISTER, .size = OPERAND_SIZE_64 },
},
},
// CALL mem64
&(opcode_data_t) {
.mnemonic = "call",
.opcode = 0xFF,
.opcode_extension = 2,
.encoding_class = ENCODING_DEFAULT,
.rex_w_prefix = true,
.operand_count = 1,
.operands = {
{ .kind = OPERAND_MEMORY, .size = OPERAND_SIZE_64 },
},
},
// JMP rel8 (short jump)
&(opcode_data_t) {
.mnemonic = "jmp",
.opcode = 0xEB,
.opcode_extension = opcode_extension_none,
.encoding_class = ENCODING_DEFAULT,
.operand_count = 1,
.operands = {
{ .kind = OPERAND_IMMEDIATE, .size = OPERAND_SIZE_8 },
},
},
// JMP rel16
&(opcode_data_t) {
.mnemonic = "jmp",
.opcode = 0xE9,
.opcode_extension = opcode_extension_none,
.encoding_class = ENCODING_DEFAULT,
.operand_size_prefix = true,
.operand_count = 1,
.operands = {
{ .kind = OPERAND_IMMEDIATE, .size = OPERAND_SIZE_16 },
},
},
// JMP reg16
&(opcode_data_t) {
.mnemonic = "jmp",
.opcode = 0xFF,
.opcode_extension = 4,
.encoding_class = ENCODING_DEFAULT,
.operand_size_prefix = true,
.operand_count = 1,
.operands = {
{ .kind = OPERAND_REGISTER, .size = OPERAND_SIZE_16 },
},
},
// JMP rel32 (near jump)
&(opcode_data_t) {
.mnemonic = "jmp",
.opcode = 0xE9,
.opcode_extension = opcode_extension_none,
.encoding_class = ENCODING_DEFAULT,
.operand_count = 1,
.operands = {
{ .kind = OPERAND_IMMEDIATE, .size = OPERAND_SIZE_32 },
},
},
// JMP reg32
&(opcode_data_t) {
.mnemonic = "jmp",
.opcode = 0xFF,
.opcode_extension = 4,
.encoding_class = ENCODING_DEFAULT,
.operand_count = 1,
.operands = {
{ .kind = OPERAND_REGISTER, .size = OPERAND_SIZE_32 },
},
},
// JMP reg64
&(opcode_data_t) {
.mnemonic = "jmp",
.opcode = 0xFF,
.opcode_extension = 4,
.encoding_class = ENCODING_DEFAULT,
.rex_w_prefix = true,
.operand_count = 1,
.operands = {
{ .kind = OPERAND_REGISTER, .size = OPERAND_SIZE_64 },
},
},
// JMP mem64
&(opcode_data_t) {
.mnemonic = "jmp",
.opcode = 0xFF,
.opcode_extension = 4,
.encoding_class = ENCODING_DEFAULT,
.rex_w_prefix = true,
.operand_count = 1,
.operands = {
{ .kind = OPERAND_MEMORY, .size = OPERAND_SIZE_64 },
},
},
nullptr, nullptr,
}; };

View File

@ -6,6 +6,31 @@
#include <errno.h> #include <errno.h>
#include <string.h> #include <string.h>
/**
* General encoder flow:
*
* There are 2 major passes the encoder does:
*
* First pass:
* - Run through the AST and collect information:
* - Set register values
* - Parse/set number values
* - Mark all instructions that use label references
* - Encode all instructions that don't use label references
* - Update addresses of all labels and instructions. Use an estimated
* instruction size for those instructions that use label references.
*
* Second pass:
* - Run through the AST for all instructions that use label references and
* collect size information using the estimated addresses from pass 1
* - Encode label references using the estimated addresses; this determines
* their size.
* - Update all addresses
*
* Iteration:
* - Repeat the second pass until no instruction changes size, for at most
* 10 passes
*/
error_t *const err_encoder_invalid_register = error_t *const err_encoder_invalid_register =
&(error_t){.message = "Invalid register"}; &(error_t){.message = "Invalid register"};
error_t *const err_encoder_number_overflow = error_t *const err_encoder_number_overflow =
@ -219,9 +244,11 @@ error_t *encoder_collect_info(encoder_t *encoder, ast_node_t *node,
ast_node_t *statement) { ast_node_t *statement) {
error_t *err = nullptr; error_t *err = nullptr;
if (encoder_is_symbols_node(node)) if (encoder_is_symbols_node(node)) {
err = symbol_table_update(encoder->symbols, node, statement); err = symbol_table_update(encoder->symbols, node, statement);
else if (node->id == NODE_NUMBER) if (statement->id == NODE_INSTRUCTION)
statement->value.instruction.has_reference = true;
} else if (node->id == NODE_NUMBER)
err = encoder_set_number_value(node); err = encoder_set_number_value(node);
else if (node->id == NODE_REGISTER) else if (node->id == NODE_REGISTER)
err = encoder_set_register_value(node); err = encoder_set_register_value(node);
@ -238,36 +265,11 @@ error_t *encoder_collect_info(encoder_t *encoder, ast_node_t *node,
return nullptr; return nullptr;
} }
/**
* Perform the initial pass over the AST.
*
* - Collect information about the operands
* - parse and set number values
* - set the register values
* - determine if label references are used by an instruction
* - encode instructions that don't use label references
* - determine estimated addresses of each statement
*
*/
error_t *encoder_first_pass(encoder_t *encoder) {
ast_node_t *root = encoder->ast;
assert(root->id == NODE_PROGRAM);
for (size_t i = 0; i < root->len; ++i) {
ast_node_t *statement = root->children[i];
error_t *err = encoder_collect_info(encoder, statement, statement);
if (err)
return err;
}
return nullptr;
}
bool is_operand_match(operand_info_t *info, ast_node_t *operand) { bool is_operand_match(operand_info_t *info, ast_node_t *operand) {
switch (info->kind) { switch (info->kind) {
case OPERAND_REGISTER: case OPERAND_REGISTER:
return operand->id == NODE_REGISTER && return operand->id == NODE_REGISTER &&
operand->value.reg.size == info->size; ast_node_register_value(operand)->size == info->size;
case OPERAND_MEMORY: case OPERAND_MEMORY:
return operand->id == NODE_MEMORY; return operand->id == NODE_MEMORY;
case OPERAND_IMMEDIATE: { case OPERAND_IMMEDIATE: {
@ -276,13 +278,10 @@ bool is_operand_match(operand_info_t *info, ast_node_t *operand) {
ast_node_t *child = operand->children[0]; ast_node_t *child = operand->children[0];
if (child->id == NODE_NUMBER) if (child->id == NODE_NUMBER)
return (child->value.number.size & info->size) > 0; return (ast_node_number_value(child)->size & info->size) > 0;
else if (child->id == NODE_LABEL_REFERENCE) else if (child->id == NODE_LABEL_REFERENCE) {
return info->size == OPERAND_SIZE_32; return info->size &= ast_node_reference_value(child)->size;
// FIXME: first pass should give us information about the distance of }
// the label reference so we can pick a size more appropriately instead
// of just defaulting to 32 bits
break;
} // end OPERAND_IMMEDIATE case } // end OPERAND_IMMEDIATE case
} }
assert(false && "unreachable"); assert(false && "unreachable");
@ -338,7 +337,7 @@ error_t *encode_one_register_in_opcode(encoder_t *encoder,
(void)encoder; (void)encoder;
(void)opcode; (void)opcode;
register_id_t id = operands->children[0]->value.reg.id; register_id_t id = ast_node_register_value(operands->children[0])->id;
encoding->buffer[encoding->len - 1] |= id & 0b111; encoding->buffer[encoding->len - 1] |= id & 0b111;
if ((id & 0b1000) > 0) { if ((id & 0b1000) > 0) {
*rex |= rex_prefix_r; *rex |= rex_prefix_r;
@ -353,7 +352,7 @@ error_t *encode_one_register(encoder_t *encoder, opcode_data_t *opcode,
assert(operands->len == 1); assert(operands->len == 1);
assert(operands->children[0]->id == NODE_REGISTER); assert(operands->children[0]->id == NODE_REGISTER);
register_id_t id = operands->children[0]->value.reg.id; register_id_t id = ast_node_register_value(operands->children[0])->id;
uint8_t modrm = modrm_mod_register; uint8_t modrm = modrm_mod_register;
@ -387,9 +386,9 @@ error_t *encode_one_immediate(encoder_t *encoder, opcode_data_t *opcode,
assert(immediate->id == NODE_NUMBER || assert(immediate->id == NODE_NUMBER ||
immediate->id == NODE_LABEL_REFERENCE); immediate->id == NODE_LABEL_REFERENCE);
operand_size_t size = opcode->operands[0].size;
if (immediate->id == NODE_NUMBER) { if (immediate->id == NODE_NUMBER) {
uint64_t value = immediate->value.number.value; uint64_t value = ast_node_number_value(immediate)->value;
operand_size_t size = opcode->operands[0].size;
error_t *err = nullptr; error_t *err = nullptr;
switch (size) { switch (size) {
case OPERAND_SIZE_8: case OPERAND_SIZE_8:
@ -409,10 +408,21 @@ error_t *encode_one_immediate(encoder_t *encoder, opcode_data_t *opcode,
} }
return err; return err;
} else { } else {
// FIXME: this still assumes references are always 32 bit reference_t *reference = ast_node_reference_value(immediate);
uint32_t value = 0xDEADBEEF; switch (size) {
return bytes_append_uint32(encoding, value); case OPERAND_SIZE_64:
return bytes_append_uint64(encoding, reference->address);
case OPERAND_SIZE_32:
return bytes_append_uint32(encoding, reference->offset);
case OPERAND_SIZE_16:
return bytes_append_uint16(encoding, reference->offset);
case OPERAND_SIZE_8:
return bytes_append_uint8(encoding, reference->offset);
default:
assert(false && "intentionally unhandled");
}
} }
__builtin_unreachable();
} }
error_t *encode_one_memory(encoder_t *encoder, opcode_data_t *opcode, error_t *encode_one_memory(encoder_t *encoder, opcode_data_t *opcode,
@ -481,7 +491,8 @@ error_t *encoder_encode_instruction(encoder_t *encoder,
return err; return err;
// produce the actual encoding output in the NODE_INSTRUCTION value // produce the actual encoding output in the NODE_INSTRUCTION value
uint8_t *output = instruction->value.instruction.encoding.buffer; instruction_t *instruction_value = ast_node_instruction_value(instruction);
uint8_t *output = instruction_value->encoding.buffer;
size_t output_len = 0; size_t output_len = 0;
// Handle prefixes // Handle prefixes
@ -500,26 +511,166 @@ error_t *encoder_encode_instruction(encoder_t *encoder,
memcpy(output + output_len, encoding->buffer, encoding->len); memcpy(output + output_len, encoding->buffer, encoding->len);
output_len += encoding->len; output_len += encoding->len;
instruction->value.instruction.encoding.len = output_len; instruction_value->encoding.len = output_len;
return nullptr; return nullptr;
} }
/** /**
* Perform the second pass that performs actual encoding. Will use * Initial guess for instruction size of instructions that contain a label
* placeholder values for label references because instruction size has not * reference
* yet been determined.
*/ */
error_t *encoder_second_pass(encoder_t *encoder) { constexpr size_t instruction_size_estimate = 10;
/**
* Perform the initial pass over the AST.
*
* - Collect information about the operands
* - parse and set number values
* - set the register values
* - determine if label references are used by an instruction
* - encode instructions that don't use label references
* - determine estimated addresses of each statement
*
*/
error_t *encoder_first_pass(encoder_t *encoder) {
ast_node_t *root = encoder->ast; ast_node_t *root = encoder->ast;
assert(root->id == NODE_PROGRAM);
uintptr_t address = 0;
for (size_t i = 0; i < root->len; ++i) { for (size_t i = 0; i < root->len; ++i) {
if (root->children[i]->id != NODE_INSTRUCTION) ast_node_t *statement = root->children[i];
continue; error_t *err = encoder_collect_info(encoder, statement, statement);
ast_node_t *instruction = root->children[i];
error_t *err = encoder_encode_instruction(encoder, instruction);
if (err) if (err)
return err; return err;
if (statement->id == NODE_INSTRUCTION &&
ast_node_instruction_value(statement)->has_reference == false) {
err = encoder_encode_instruction(encoder, statement);
if (err)
return err;
instruction_t *instruction = ast_node_instruction_value(statement);
instruction->address = address;
address += instruction->encoding.len;
} else if (statement->id == NODE_INSTRUCTION) {
instruction_t *instruction = ast_node_instruction_value(statement);
instruction->encoding.len = instruction_size_estimate;
instruction->address = address;
address += instruction_size_estimate;
} else if (statement->id == NODE_LABEL) {
label_t *label = ast_node_label_value(statement);
label->address = address;
}
}
return nullptr;
}
operand_size_t signed_to_size_mask(int64_t value) {
operand_size_t size = OPERAND_SIZE_64;
if (value >= INT8_MIN && value <= INT8_MAX)
size |= OPERAND_SIZE_8;
if (value >= INT16_MIN && value <= INT16_MAX)
size |= OPERAND_SIZE_16;
if (value >= INT32_MIN && value <= INT32_MAX)
size |= OPERAND_SIZE_32;
return size;
}
int64_t statement_offset(ast_node_t *from, ast_node_t *to) {
assert(from->id == NODE_INSTRUCTION);
assert(to->id == NODE_LABEL);
instruction_t *instruction = ast_node_instruction_value(from);
int64_t from_addr = instruction->address + instruction->encoding.len;
int64_t to_addr = ast_node_label_value(to)->address;
return to_addr - from_addr;
}
error_t *encoder_collect_reference_info(encoder_t *encoder, ast_node_t *node,
ast_node_t *statement) {
assert(statement->id == NODE_INSTRUCTION);
if (node->id == NODE_LABEL_REFERENCE) {
const char *name = node->token_entry->token.value;
symbol_t *symbol = symbol_table_lookup(encoder->symbols, name);
assert(symbol && symbol->statement &&
symbol->statement->id == NODE_LABEL);
int64_t offset = statement_offset(statement, symbol->statement);
int64_t absolute = ast_node_label_value(symbol->statement)->address;
operand_size_t size = signed_to_size_mask(offset);
node->value.reference.address = absolute;
node->value.reference.offset = offset;
node->value.reference.size = size;
}
for (size_t i = 0; i < node->len; ++i) {
error_t *err = encoder_collect_reference_info(
encoder, node->children[i], statement);
if (err)
return err;
}
return nullptr;
}
bool encoder_should_reencode(ast_node_t *statement) {
if (statement->id != NODE_INSTRUCTION)
return false;
instruction_t *instruction = ast_node_instruction_value(statement);
return instruction->has_reference;
}
void set_statement_address(ast_node_t *statement, int64_t address) {
if (statement->id == NODE_INSTRUCTION) {
ast_node_instruction_value(statement)->address = address;
} else if (statement->id == NODE_LABEL) {
ast_node_label_value(statement)->address = address;
}
}
size_t get_statement_length(ast_node_t *statement) {
if (statement->id != NODE_INSTRUCTION)
return 0;
return ast_node_instruction_value(statement)->encoding.len;
}
/**
* Perform the second pass. Updates the label info and encodes all instructions
* that have a label reference.that performs actual encoding.
*/
error_t *encoder_second_pass(encoder_t *encoder, bool *did_update) {
ast_node_t *root = encoder->ast;
*did_update = false;
int64_t address = 0;
for (size_t i = 0; i < root->len; ++i) {
ast_node_t *statement = root->children[i];
set_statement_address(statement, address);
size_t before = get_statement_length(statement);
if (encoder_should_reencode(statement)) {
error_t *err =
encoder_collect_reference_info(encoder, statement, statement);
if (err)
return err;
err = encoder_encode_instruction(encoder, statement);
if (err)
return err;
}
size_t after = get_statement_length(statement);
*did_update = *did_update || (before != after);
address += after;
} }
return nullptr; return nullptr;
} }
@ -549,5 +700,12 @@ error_t *encoder_encode(encoder_t *encoder) {
err = encoder_check_symbols(encoder); err = encoder_check_symbols(encoder);
if (err) if (err)
return err; return err;
return encoder_second_pass(encoder);
bool did_update = true;
for (int i = 0; i < 10 && did_update; ++i) {
err = encoder_second_pass(encoder, &did_update);
if (err)
return err;
}
return nullptr;
} }