Compare commits

...

3 Commits

Author SHA1 Message Date
fab5bedf3d Implement two pass encoding
Some checks failed
Validate the build / validate-build (push) Failing after 50s
First pass:
 - collect information for numbers, registers and which instructions
   contain label references
 - encode all instructions that don't contain label references
 - Set (temporary) addresses for each instruction

Second pass:
 - Collect information about label references (address, offset, size)
 - encode all instructions that contain label references
 - Update (if necessary) addresses for each instruction

 The second pass is iterated 10 times or until no instructions change
 size, whichever comes first.
2025-04-22 02:08:38 +02:00
9a1570e3e5 Add more values to the ast to facilitate encoding
- Add an instruction value that contains the encoding, the address and a
  flag to indicate if this instruction contains label references
- Add a label value that contains an address
- Add a reference value that contains an offset, an absolute address and an
  operand size
2025-04-22 00:54:12 +02:00
9c6b69e187 Symbol table now keeps track of label statements
Before it kept track of a more specific node that referenced the symbol
in some way. Now it will only keep track of the actual label defining
statements. This is done to facilitate encoding. The encoder can now go
from a symbol name to the statement that defines the symbol.

Restructure the encoder to deal with this and pass the correct statement
to the symbol update function.
2025-04-18 14:00:08 +02:00
6 changed files with 207 additions and 40 deletions

View File

@ -75,7 +75,7 @@ typedef struct register_ {
} register_t;
typedef struct opcode_encoding {
uint8_t encoding[32];
uint8_t buffer[32];
size_t len;
} opcode_encoding_t;
@ -89,7 +89,19 @@ struct ast_node {
union {
register_t reg;
number_t number;
opcode_encoding_t encoding;
struct {
bool has_reference;
opcode_encoding_t encoding;
int64_t address;
} instruction;
struct {
int64_t offset;
int64_t address;
operand_size_t size;
} reference;
struct {
int64_t address;
} label;
} value;
};

View File

@ -6,6 +6,31 @@
#include <errno.h>
#include <string.h>
/**
* General encoder flow:
*
* There are 2 major passes the encoder does:
*
* First pass:
* - Run through the AST and collect information:
* - Set register values
* - Parse/set number values
* - Mark all instructions that use label references
* - Encode all instructions that don't use label references
* - Update addresses of all labels and instructions. Use an estimated
* instruction size for those instructions that use label references.
*
* Second pass:
* - Run through the AST for all instructions that use label references and
* collect size information using the estimated addresses from pass 1
* - Encode label references with the estimated addresses, this fixes their
* size.
* - Update all addresses
*
* Iteration:
* - Repeat the second pass until addresses converge
*/
error_t *const err_encoder_invalid_register =
&(error_t){.message = "Invalid register"};
error_t *const err_encoder_number_overflow =
@ -23,13 +48,15 @@ error_t *const err_encoder_not_implemented =
error_t *const err_encoder_unexpected_length =
&(error_t){.message = "Unexpectedly long encoding"};
error_t *encoder_alloc(encoder_t **output) {
error_t *encoder_alloc(encoder_t **output, ast_node_t *ast) {
*output = nullptr;
encoder_t *encoder = calloc(1, sizeof(encoder_t));
if (encoder == nullptr)
return err_allocation_failed;
encoder->ast = ast;
error_t *err = symbol_table_alloc(&encoder->symbols);
if (err) {
free(encoder);
@ -213,16 +240,15 @@ static inline uint8_t modrm_rm(uint8_t modrm, register_id_t id) {
return (modrm & ~modrm_rm_mask) | (id & 0b111);
}
/**
* Perform the initial pass over the AST. Records all symbols and sets the
* values of registers and numbers.
*/
error_t *encoder_first_pass(encoder_t *encoder, ast_node_t *node) {
error_t *encoder_collect_info(encoder_t *encoder, ast_node_t *node,
ast_node_t *statement) {
error_t *err = nullptr;
if (encoder_is_symbols_node(node))
err = symbol_table_update(encoder->symbols, node);
else if (node->id == NODE_NUMBER)
if (encoder_is_symbols_node(node)) {
err = symbol_table_update(encoder->symbols, node, statement);
if (statement->id == NODE_INSTRUCTION)
statement->value.instruction.has_reference = true;
} else if (node->id == NODE_NUMBER)
err = encoder_set_number_value(node);
else if (node->id == NODE_REGISTER)
err = encoder_set_register_value(node);
@ -230,7 +256,8 @@ error_t *encoder_first_pass(encoder_t *encoder, ast_node_t *node) {
return err;
for (size_t i = 0; i < node->len; ++i) {
error_t *err = encoder_first_pass(encoder, node->children[i]);
error_t *err =
encoder_collect_info(encoder, node->children[i], statement);
if (err)
return err;
}
@ -456,7 +483,7 @@ error_t *encoder_encode_instruction(encoder_t *encoder,
return err;
// produce the actual encoding output in the NODE_INSTRUCTION value
uint8_t *output = instruction->value.encoding.encoding;
uint8_t *output = instruction->value.instruction.encoding.buffer;
size_t output_len = 0;
// Handle prefixes
@ -475,24 +502,136 @@ error_t *encoder_encode_instruction(encoder_t *encoder,
memcpy(output + output_len, encoding->buffer, encoding->len);
output_len += encoding->len;
instruction->value.encoding.len = output_len;
instruction->value.instruction.encoding.len = output_len;
return nullptr;
}
/**
* Perform the second pass that performs actual encoding. Will use
* placeholder values for label references because instruction size has not
* yet been determined.
* Perform the initial pass over the AST.
*
* - Collect information about the operands
* - parse and set number values
* - set the register values
* - determine if label references are used by an instruction
* - encode instructions that don't use label references
* - determine estimated addresses of each statement
*
*/
error_t *encoder_encoding_pass(encoder_t *encoder, ast_node_t *root) {
constexpr size_t instruction_size_estimate = 10;
error_t *encoder_first_pass(encoder_t *encoder) {
ast_node_t *root = encoder->ast;
assert(root->id == NODE_PROGRAM);
uintptr_t address = 0;
for (size_t i = 0; i < root->len; ++i) {
if (root->children[i]->id != NODE_INSTRUCTION)
continue;
ast_node_t *instruction = root->children[i];
error_t *err = encoder_encode_instruction(encoder, instruction);
ast_node_t *statement = root->children[i];
error_t *err = encoder_collect_info(encoder, statement, statement);
if (err)
return err;
if (statement->id == NODE_INSTRUCTION &&
statement->value.instruction.has_reference == false) {
err = encoder_encode_instruction(encoder, statement);
if (err)
return err;
statement->value.instruction.address = address;
address += statement->value.instruction.encoding.len;
} else if (statement->id == NODE_INSTRUCTION) {
statement->value.instruction.encoding.len =
instruction_size_estimate;
statement->value.instruction.address = address;
address += instruction_size_estimate;
} else if (statement->id == NODE_LABEL) {
statement->value.instruction.address = address;
}
}
return nullptr;
}
/**
 * Build the mask of operand sizes that can represent a signed value.
 *
 * OPERAND_SIZE_64 is always part of the result. OPERAND_SIZE_32,
 * OPERAND_SIZE_16 and OPERAND_SIZE_8 are OR-ed in when the value lies inside
 * the range of int32_t, int16_t and int8_t respectively.
 *
 * @param value the signed value to classify
 * @return the combined operand size mask
 */
operand_size_t signed_to_size_mask(int64_t value) {
    operand_size_t mask = OPERAND_SIZE_64;

    // The ranges are nested, so as soon as the value falls outside one of
    // them no narrower size can apply either.
    if (value < INT32_MIN || value > INT32_MAX)
        return mask;
    mask |= OPERAND_SIZE_32;

    if (value < INT16_MIN || value > INT16_MAX)
        return mask;
    mask |= OPERAND_SIZE_16;

    if (value < INT8_MIN || value > INT8_MAX)
        return mask;
    mask |= OPERAND_SIZE_8;

    return mask;
}
/**
 * Compute the distance from the end of one statement to the start of another.
 *
 * The offset is measured from the first byte after `from` (its address plus
 * its encoded length) to the address of `to`.
 *
 * Both statements are read through value.instruction.address; that is the
 * slot the first pass fills in for labels as well as instructions.
 * NOTE(review): the AST also declares a dedicated value.label.address field;
 * confirm which of the two is meant to be authoritative for labels.
 *
 * @param from statement the offset is relative to (label or instruction)
 * @param to   statement being targeted (label or instruction)
 * @return signed distance in bytes, negative when `to` precedes the end of
 *         `from`
 */
int64_t statement_offset(ast_node_t *from, ast_node_t *to) {
    assert(from->id == NODE_LABEL || from->id == NODE_INSTRUCTION);
    assert(to->id == NODE_LABEL || to->id == NODE_INSTRUCTION);

    // Only instructions carry an encoding. A label occupies no bytes, and its
    // value.instruction.encoding.len is never set, so it must not be read.
    int64_t from_len = 0;
    if (from->id == NODE_INSTRUCTION)
        from_len = (int64_t)from->value.instruction.encoding.len;

    int64_t from_end = from->value.instruction.address + from_len;
    int64_t to_addr = to->value.instruction.address;

    return to_addr - from_end;
}
/**
 * Resolve the label references used by an instruction.
 *
 * Visits `node` and all of its descendants. For every NODE_LABEL_REFERENCE
 * the referenced symbol is looked up and the reference value is filled in:
 *  - address: the address recorded on the label statement
 *  - offset:  result of statement_offset() from `statement` to the label
 *  - size:    result of signed_to_size_mask() for that offset
 *
 * The walk is needed because the caller hands in the instruction itself as
 * `node`; checking only that node would never reach the references.
 *
 * @param encoder   encoder whose symbol table is searched
 * @param node      node to inspect, initially the instruction itself
 * @param statement instruction that owns `node`, must be NODE_INSTRUCTION
 * @return nullptr on success, otherwise the first error reported for a child
 *
 * @pre every referenced symbol resolves to a NODE_LABEL statement (asserted)
 */
error_t *encoder_collect_label_info(encoder_t *encoder, ast_node_t *node,
                                    ast_node_t *statement) {
    assert(statement->id == NODE_INSTRUCTION);

    if (node->id == NODE_LABEL_REFERENCE) {
        const char *name = node->token_entry->token.value;
        symbol_t *symbol = symbol_table_lookup(encoder->symbols, name);
        assert(symbol && symbol->statement &&
               symbol->statement->id == NODE_LABEL);

        int64_t offset = statement_offset(statement, symbol->statement);
        int64_t absolute = symbol->statement->value.instruction.address;
        operand_size_t size = signed_to_size_mask(offset);

        node->value.reference.address = absolute;
        node->value.reference.offset = offset;
        node->value.reference.size = size;
    }

    // Descend into the children so references nested below the instruction
    // are found.
    for (size_t i = 0; i < node->len; ++i) {
        error_t *err =
            encoder_collect_label_info(encoder, node->children[i], statement);
        if (err)
            return err;
    }

    return nullptr;
}
/**
 * Perform the second pass.
 *
 * Walks all top level statements in order while keeping a running address:
 *  - instructions with a label reference get their label info refreshed and
 *    are encoded again, after which their new length advances the address
 *  - instructions without a label reference keep their existing encoding and
 *    only receive the updated address
 *  - labels receive the current address and do not advance it
 *
 * Label addresses are stored in value.instruction.address because that is
 * the slot the first pass fills in and the one statement_offset() and
 * encoder_collect_label_info() read back.
 * NOTE(review): value.label.address also exists; confirm nothing else
 * depends on it being updated here.
 *
 * @param encoder    encoder holding the AST and the symbol table
 * @param did_update set to true when at least one re-encoded instruction
 *                   changed length during this pass, false otherwise
 * @return nullptr on success, otherwise the first error encountered
 */
error_t *encoder_second_pass(encoder_t *encoder, bool *did_update) {
    ast_node_t *root = encoder->ast;
    *did_update = false;

    int64_t address = 0;
    for (size_t i = 0; i < root->len; ++i) {
        ast_node_t *statement = root->children[i];

        if (statement->id == NODE_LABEL) {
            // Labels take no space, they only mark the current position.
            statement->value.instruction.address = address;
            continue;
        }

        if (statement->id != NODE_INSTRUCTION)
            continue;

        statement->value.instruction.address = address;

        if (statement->value.instruction.has_reference) {
            size_t before = statement->value.instruction.encoding.len;

            error_t *err =
                encoder_collect_label_info(encoder, statement, statement);
            if (err)
                return err;
            err = encoder_encode_instruction(encoder, statement);
            if (err)
                return err;

            size_t after = statement->value.instruction.encoding.len;
            if (before != after)
                *did_update = true;
        }

        // Advance exactly once per instruction by its current length.
        address += (int64_t)statement->value.instruction.encoding.len;
    }

    return nullptr;
}
@ -515,12 +654,19 @@ error_t *encoder_check_symbols(encoder_t *encoder) {
return nullptr;
}
error_t *encoder_encode(encoder_t *encoder, ast_node_t *ast) {
error_t *err = encoder_first_pass(encoder, ast);
error_t *encoder_encode(encoder_t *encoder) {
error_t *err = encoder_first_pass(encoder);
if (err)
return err;
err = encoder_check_symbols(encoder);
if (err)
return err;
return encoder_encoding_pass(encoder, ast);
bool did_update = true;
for (int i = 0; i < 10 && did_update; ++i) {
err = encoder_second_pass(encoder, &did_update);
if (err)
return err;
}
return nullptr;
}

View File

@ -5,6 +5,7 @@
typedef struct encoder {
symbol_table_t *symbols;
ast_node_t *ast;
} encoder_t;
constexpr uint8_t modrm_mod_memory = 0b00'000'000;
@ -16,8 +17,8 @@ constexpr uint8_t modrm_reg_mask = 0b00'111'000;
constexpr uint8_t modrm_rm_mask = 0b00'000'111;
constexpr uint8_t modrm_mod_mask = 0b11'000'000;
error_t *encoder_alloc(encoder_t **encoder);
error_t *encoder_encode(encoder_t *encoder, ast_node_t *ast);
error_t *encoder_alloc(encoder_t **encoder, ast_node_t *ast);
error_t *encoder_encode(encoder_t *encoder);
void encoder_free(encoder_t *encoder);
extern error_t *const err_encoder_invalid_register;

View File

@ -92,7 +92,7 @@ EXPORT | | | ERR | |
-------------|-----------|----------|----------|----------|
*/
bool symbol_table_should_update(symbol_kind_t old, symbol_kind_t new) {
bool symbol_table_should_upgrade(symbol_kind_t old, symbol_kind_t new) {
if (old == SYMBOL_REFERENCE)
return new != SYMBOL_REFERENCE;
if (old == SYMBOL_LOCAL)
@ -112,7 +112,7 @@ bool symbol_table_should_error(symbol_kind_t old, symbol_kind_t new) {
* @pre The symbol _must not_ already be in the table.
*/
error_t *symbol_table_add(symbol_table_t *table, char *name, symbol_kind_t kind,
ast_node_t *node) {
ast_node_t *statement) {
if (table->len >= table->cap) {
error_t *err = symbol_table_grow_cap(table);
if (err)
@ -122,7 +122,7 @@ error_t *symbol_table_add(symbol_table_t *table, char *name, symbol_kind_t kind,
table->symbols[table->len] = (symbol_t){
.name = name,
.kind = kind,
.node = node,
.statement = statement,
};
table->len += 1;
@ -130,23 +130,29 @@ error_t *symbol_table_add(symbol_table_t *table, char *name, symbol_kind_t kind,
return nullptr;
}
error_t *symbol_table_update(symbol_table_t *table, ast_node_t *node) {
error_t *symbol_table_update(symbol_table_t *table, ast_node_t *node,
ast_node_t *statement) {
char *name;
symbol_kind_t kind;
error_t *err = symbol_table_get_node_info(node, &kind, &name);
if (err)
return err;
if (kind != SYMBOL_LOCAL)
statement = nullptr;
symbol_t *symbol = symbol_table_lookup(table, name);
if (!symbol)
return symbol_table_add(table, name, kind, node);
return symbol_table_add(table, name, kind, statement);
if (symbol_table_should_error(symbol->kind, kind))
return err_symbol_table_incompatible_symbols;
if (symbol_table_should_update(symbol->kind, kind)) {
symbol->name = name;
if (symbol_table_should_upgrade(symbol->kind, kind)) {
symbol->kind = kind;
symbol->node = node;
}
if (kind == SYMBOL_LOCAL && symbol->statement == nullptr)
symbol->statement = statement;
return nullptr;
}

View File

@ -29,7 +29,7 @@ typedef enum symbol_kind {
typedef struct symbol {
char *name;
symbol_kind_t kind;
ast_node_t *node;
ast_node_t *statement;
} symbol_t;
typedef struct symbol_table {
@ -40,7 +40,8 @@ typedef struct symbol_table {
error_t *symbol_table_alloc(symbol_table_t **table);
void symbol_table_free(symbol_table_t *table);
error_t *symbol_table_update(symbol_table_t *table, ast_node_t *node);
error_t *symbol_table_update(symbol_table_t *table, ast_node_t *node,
ast_node_t *statement);
symbol_t *symbol_table_lookup(symbol_table_t *table, const char *name);
#endif // INCLUDE_ENCODER_SYMBOLS_H_

View File

@ -74,11 +74,11 @@ error_t *print_encoding(tokenlist_t *list) {
return result.err;
encoder_t *encoder;
error_t *err = encoder_alloc(&encoder);
error_t *err = encoder_alloc(&encoder, result.node);
if (err)
goto cleanup_ast;
err = encoder_encode(encoder, result.node);
err = encoder_encode(encoder);
if (err)
goto cleanup_ast;
@ -88,7 +88,8 @@ error_t *print_encoding(tokenlist_t *list) {
if (node->id != NODE_INSTRUCTION)
continue;
print_hex(node->value.encoding.len, node->value.encoding.encoding);
print_hex(node->value.instruction.encoding.len,
node->value.instruction.encoding.buffer);
}
encoder_free(encoder);