Compare commits

..

1 Commits

Author SHA1 Message Date
79e0120d52 Add statement index to the symbol table
Some checks failed
Validate the build / validate-build (push) Failing after 37s
This allows going from a label to the statement/address. Restructure the
encoder to deal with this and pass the correct statement index to the
symbol update.
2025-04-18 02:31:21 +02:00
5 changed files with 64 additions and 191 deletions

View File

@ -75,7 +75,7 @@ typedef struct register_ {
} register_t; } register_t;
typedef struct opcode_encoding { typedef struct opcode_encoding {
uint8_t buffer[32]; uint8_t encoding[32];
size_t len; size_t len;
} opcode_encoding_t; } opcode_encoding_t;
@ -89,19 +89,7 @@ struct ast_node {
union { union {
register_t reg; register_t reg;
number_t number; number_t number;
struct { opcode_encoding_t encoding;
bool has_reference;
opcode_encoding_t encoding;
int64_t address;
} instruction;
struct {
int64_t offset;
int64_t address;
operand_size_t size;
} reference;
struct {
int64_t address;
} label;
} value; } value;
}; };

View File

@ -6,31 +6,6 @@
#include <errno.h> #include <errno.h>
#include <string.h> #include <string.h>
/**
* General encoder flow:
*
* There are 2 major passes the encoder does:
*
* First pass:
* - Run through the AST and collect information:
* - Set register values
* - Parse/set number values
* - Mark all instructions that use label references
* - Encode all instructions that don't use label references
* - Update addresses of all labels and instructions. Use an estimated
* instruction size for those instructions that use label references.
*
* Second pass:
* - Run through the AST for all instructions that use label references and
* collect size information using the estimated addresses from pass 1
* - Encode label references with the estimated addresses, this fixes their
* size.
* - Update all addresses
*
* Iteration:
* - Repeat the second pass until addresses converge
*/
error_t *const err_encoder_invalid_register = error_t *const err_encoder_invalid_register =
&(error_t){.message = "Invalid register"}; &(error_t){.message = "Invalid register"};
error_t *const err_encoder_number_overflow = error_t *const err_encoder_number_overflow =
@ -241,14 +216,12 @@ static inline uint8_t modrm_rm(uint8_t modrm, register_id_t id) {
} }
error_t *encoder_collect_info(encoder_t *encoder, ast_node_t *node, error_t *encoder_collect_info(encoder_t *encoder, ast_node_t *node,
ast_node_t *statement) { size_t statement_index) {
error_t *err = nullptr; error_t *err = nullptr;
if (encoder_is_symbols_node(node)) { if (encoder_is_symbols_node(node))
err = symbol_table_update(encoder->symbols, node, statement); err = symbol_table_update(encoder->symbols, node, statement_index);
if (statement->id == NODE_INSTRUCTION) else if (node->id == NODE_NUMBER)
statement->value.instruction.has_reference = true;
} else if (node->id == NODE_NUMBER)
err = encoder_set_number_value(node); err = encoder_set_number_value(node);
else if (node->id == NODE_REGISTER) else if (node->id == NODE_REGISTER)
err = encoder_set_register_value(node); err = encoder_set_register_value(node);
@ -257,7 +230,7 @@ error_t *encoder_collect_info(encoder_t *encoder, ast_node_t *node,
for (size_t i = 0; i < node->len; ++i) { for (size_t i = 0; i < node->len; ++i) {
error_t *err = error_t *err =
encoder_collect_info(encoder, node->children[i], statement); encoder_collect_info(encoder, node->children[i], statement_index);
if (err) if (err)
return err; return err;
} }
@ -265,6 +238,33 @@ error_t *encoder_collect_info(encoder_t *encoder, ast_node_t *node,
return nullptr; return nullptr;
} }
/**
 * Perform the initial pass over the AST.
 *
 * Visits every top-level statement of the program node and hands it to
 * encoder_collect_info() together with its index in root->children. The index
 * is what gets forwarded to the symbol table update, so a symbol can later be
 * traced back to the statement it came from.
 *
 * This function itself does not encode instructions or assign addresses.
 *
 * @param encoder encoder whose ast member is the NODE_PROGRAM root
 * @return nullptr on success, otherwise the first error returned by
 *         encoder_collect_info()
 */
error_t *encoder_first_pass(encoder_t *encoder) {
    ast_node_t *root = encoder->ast;
    assert(root->id == NODE_PROGRAM);

    for (size_t i = 0; i < root->len; ++i) {
        // i doubles as the statement index recorded for symbols
        error_t *err = encoder_collect_info(encoder, root->children[i], i);
        if (err)
            return err;
    }

    return nullptr;
}
bool is_operand_match(operand_info_t *info, ast_node_t *operand) { bool is_operand_match(operand_info_t *info, ast_node_t *operand) {
switch (info->kind) { switch (info->kind) {
case OPERAND_REGISTER: case OPERAND_REGISTER:
@ -483,7 +483,7 @@ error_t *encoder_encode_instruction(encoder_t *encoder,
return err; return err;
// produce the actual encoding output in the NODE_INSTRUCTION value // produce the actual encoding output in the NODE_INSTRUCTION value
uint8_t *output = instruction->value.instruction.encoding.buffer; uint8_t *output = instruction->value.encoding.encoding;
size_t output_len = 0; size_t output_len = 0;
// Handle prefixes // Handle prefixes
@ -502,136 +502,26 @@ error_t *encoder_encode_instruction(encoder_t *encoder,
memcpy(output + output_len, encoding->buffer, encoding->len); memcpy(output + output_len, encoding->buffer, encoding->len);
output_len += encoding->len; output_len += encoding->len;
instruction->value.instruction.encoding.len = output_len; instruction->value.encoding.len = output_len;
return nullptr; return nullptr;
} }
/** /**
* Perform the initial pass over the AST. * Perform the second pass that performs actual encoding. Will use
* * placeholder values for label references because instruction size has not
* - Collect information about the operands * yet been determined.
* - parse and set number values
* - set the register values
* - determine if label references are used by an instruction
* - encode instructions that don't use label references
* - determine estimated addresses of each statement
*
*/ */
constexpr size_t instruction_size_estimate = 10; error_t *encoder_second_pass(encoder_t *encoder) {
error_t *encoder_first_pass(encoder_t *encoder) {
ast_node_t *root = encoder->ast; ast_node_t *root = encoder->ast;
assert(root->id == NODE_PROGRAM);
uintptr_t address = 0;
for (size_t i = 0; i < root->len; ++i) { for (size_t i = 0; i < root->len; ++i) {
ast_node_t *statement = root->children[i]; if (root->children[i]->id != NODE_INSTRUCTION)
error_t *err = encoder_collect_info(encoder, statement, statement); continue;
ast_node_t *instruction = root->children[i];
error_t *err = encoder_encode_instruction(encoder, instruction);
if (err) if (err)
return err; return err;
if (statement->id == NODE_INSTRUCTION &&
statement->value.instruction.has_reference == false) {
err = encoder_encode_instruction(encoder, statement);
if (err)
return err;
statement->value.instruction.address = address;
address += statement->value.instruction.encoding.len;
} else if (statement->id == NODE_INSTRUCTION) {
statement->value.instruction.encoding.len =
instruction_size_estimate;
statement->value.instruction.address = address;
address += instruction_size_estimate;
} else if (statement->id == NODE_LABEL) {
statement->value.instruction.address = address;
}
}
return nullptr;
}
/**
 * Build the mask of operand sizes that can hold a signed value.
 *
 * OPERAND_SIZE_64 is always part of the result. The 32, 16 and 8 bit flags
 * are added when the value lies inside the matching intN_t range.
 *
 * @param value signed value to classify
 * @return bitwise OR of the OPERAND_SIZE_* flags that fit the value
 */
operand_size_t signed_to_size_mask(int64_t value) {
    operand_size_t mask = OPERAND_SIZE_64;

    // The ranges nest, so a value outside a wider range cannot fit a
    // narrower one and the remaining checks can be skipped.
    if (value < INT32_MIN || value > INT32_MAX)
        return mask;
    mask |= OPERAND_SIZE_32;

    if (value < INT16_MIN || value > INT16_MAX)
        return mask;
    mask |= OPERAND_SIZE_16;

    if (value < INT8_MIN || value > INT8_MAX)
        return mask;
    mask |= OPERAND_SIZE_8;

    return mask;
}
/**
 * Compute the distance from the end of one statement to the start of another.
 *
 * The end of `from` is its address plus its encoding length; the result is
 * the address of `to` minus that end position.
 *
 * NOTE(review): both nodes are read through the `instruction` union member
 * even though the asserts also admit NODE_LABEL - confirm that this is the
 * intended access for label nodes.
 *
 * @param from statement the offset is measured from (label or instruction)
 * @param to   statement the offset points at (label or instruction)
 * @return signed difference between the address of `to` and the end of `from`
 */
int64_t statement_offset(ast_node_t *from, ast_node_t *to) {
    assert(from->id == NODE_LABEL || from->id == NODE_INSTRUCTION);
    assert(to->id == NODE_LABEL || to->id == NODE_INSTRUCTION);

    const int64_t target = to->value.instruction.address;
    const int64_t origin_end =
        from->value.instruction.address + from->value.instruction.encoding.len;

    return target - origin_end;
}
/**
 * Fill in the reference value of a label reference node.
 *
 * For a NODE_LABEL_REFERENCE the referenced symbol is looked up by the token
 * value of the node. The node then receives the address of the label
 * statement, the offset from `statement` to that label and the size mask
 * derived from the offset. Nodes of any other kind are left untouched.
 *
 * @param encoder   encoder providing the symbol table
 * @param node      node to inspect
 * @param statement instruction statement the node belongs to
 * @return always nullptr
 */
error_t *encoder_collect_label_info(encoder_t *encoder, ast_node_t *node,
                                    ast_node_t *statement) {
    assert(statement->id == NODE_INSTRUCTION);

    if (node->id != NODE_LABEL_REFERENCE)
        return nullptr;

    const char *label_name = node->token_entry->token.value;
    symbol_t *target = symbol_table_lookup(encoder->symbols, label_name);
    assert(target && target->statement &&
           target->statement->id == NODE_LABEL);

    // Compute everything first, then store the results on the node.
    const int64_t relative = statement_offset(statement, target->statement);
    const int64_t label_address = target->statement->value.instruction.address;
    const operand_size_t fitting_sizes = signed_to_size_mask(relative);

    node->value.reference.size = fitting_sizes;
    node->value.reference.offset = relative;
    node->value.reference.address = label_address;

    return nullptr;
}
/**
* Perform the second pass. Updates the label info and encodes all instructions
* that have a label reference.
*/
error_t *encoder_second_pass(encoder_t *encoder, bool *did_update) {
ast_node_t *root = encoder->ast;
*did_update = false;
int64_t address = 0;
for (size_t i = 0; i < root->len; ++i) {
ast_node_t *statement = root->children[i];
if (statement->id == NODE_INSTRUCTION &&
statement->value.instruction.has_reference) {
statement->value.instruction.address = address;
size_t before = statement->value.instruction.encoding.len;
error_t *err =
encoder_collect_label_info(encoder, statement, statement);
if (err)
return err;
err = encoder_encode_instruction(encoder, statement);
if (err)
return err;
size_t after = statement->value.instruction.encoding.len;
address += after;
*did_update = *did_update || (before != after);
} else if (statement->id == NODE_INSTRUCTION &&
statement->value.instruction.has_reference) {
statement->value.instruction.address = address;
address += statement->value.instruction.encoding.len;
} else if (statement->id == NODE_LABEL) {
statement->value.label.address = address;
}
address += statement->value.instruction.encoding.len;
} }
return nullptr; return nullptr;
} }
@ -661,12 +551,5 @@ error_t *encoder_encode(encoder_t *encoder) {
err = encoder_check_symbols(encoder); err = encoder_check_symbols(encoder);
if (err) if (err)
return err; return err;
return encoder_second_pass(encoder);
bool did_update = true;
for (int i = 0; i < 10 && did_update; ++i) {
err = encoder_second_pass(encoder, &did_update);
if (err)
return err;
}
return nullptr;
} }

View File

@ -92,7 +92,7 @@ EXPORT | | | ERR | |
-------------|-----------|----------|----------|----------| -------------|-----------|----------|----------|----------|
*/ */
bool symbol_table_should_upgrade(symbol_kind_t old, symbol_kind_t new) { bool symbol_table_should_update(symbol_kind_t old, symbol_kind_t new) {
if (old == SYMBOL_REFERENCE) if (old == SYMBOL_REFERENCE)
return new != SYMBOL_REFERENCE; return new != SYMBOL_REFERENCE;
if (old == SYMBOL_LOCAL) if (old == SYMBOL_LOCAL)
@ -112,7 +112,7 @@ bool symbol_table_should_error(symbol_kind_t old, symbol_kind_t new) {
* @pre The symbol _must not_ already be in the table. * @pre The symbol _must not_ already be in the table.
*/ */
error_t *symbol_table_add(symbol_table_t *table, char *name, symbol_kind_t kind, error_t *symbol_table_add(symbol_table_t *table, char *name, symbol_kind_t kind,
ast_node_t *statement) { ast_node_t *node, size_t statement_index) {
if (table->len >= table->cap) { if (table->len >= table->cap) {
error_t *err = symbol_table_grow_cap(table); error_t *err = symbol_table_grow_cap(table);
if (err) if (err)
@ -122,7 +122,8 @@ error_t *symbol_table_add(symbol_table_t *table, char *name, symbol_kind_t kind,
table->symbols[table->len] = (symbol_t){ table->symbols[table->len] = (symbol_t){
.name = name, .name = name,
.kind = kind, .kind = kind,
.statement = statement, .node = node,
.statement_index = statement_index,
}; };
table->len += 1; table->len += 1;
@ -131,28 +132,29 @@ error_t *symbol_table_add(symbol_table_t *table, char *name, symbol_kind_t kind,
} }
error_t *symbol_table_update(symbol_table_t *table, ast_node_t *node, error_t *symbol_table_update(symbol_table_t *table, ast_node_t *node,
ast_node_t *statement) { size_t statement_index) {
char *name; char *name;
symbol_kind_t kind; symbol_kind_t kind;
error_t *err = symbol_table_get_node_info(node, &kind, &name); error_t *err = symbol_table_get_node_info(node, &kind, &name);
if (err) if (err)
return err; return err;
if (kind != SYMBOL_LOCAL)
statement = nullptr;
symbol_t *symbol = symbol_table_lookup(table, name); symbol_t *symbol = symbol_table_lookup(table, name);
if (!symbol) if (!symbol)
return symbol_table_add(table, name, kind, statement); return symbol_table_add(table, name, kind, node, statement_index);
if (symbol_table_should_error(symbol->kind, kind)) if (symbol_table_should_error(symbol->kind, kind))
return err_symbol_table_incompatible_symbols; return err_symbol_table_incompatible_symbols;
if (symbol_table_should_upgrade(symbol->kind, kind)) { if (symbol_table_should_update(symbol->kind, kind)) {
symbol->name = name;
symbol->kind = kind; symbol->kind = kind;
symbol->node = node;
// Some deviation from the regular update, the most important statement
// to keep track of is the actual label even if it gets promoted beyond
// a local symbol because the label determines the address
if (kind == SYMBOL_LOCAL)
symbol->statement_index = statement_index;
} }
if (kind == SYMBOL_LOCAL && symbol->statement == nullptr)
symbol->statement = statement;
return nullptr; return nullptr;
} }

View File

@ -29,7 +29,8 @@ typedef enum symbol_kind {
typedef struct symbol { typedef struct symbol {
char *name; char *name;
symbol_kind_t kind; symbol_kind_t kind;
ast_node_t *statement; ast_node_t *node;
size_t statement_index;
} symbol_t; } symbol_t;
typedef struct symbol_table { typedef struct symbol_table {
@ -41,7 +42,7 @@ typedef struct symbol_table {
error_t *symbol_table_alloc(symbol_table_t **table); error_t *symbol_table_alloc(symbol_table_t **table);
void symbol_table_free(symbol_table_t *table); void symbol_table_free(symbol_table_t *table);
error_t *symbol_table_update(symbol_table_t *table, ast_node_t *node, error_t *symbol_table_update(symbol_table_t *table, ast_node_t *node,
ast_node_t *statement); size_t statement_index);
symbol_t *symbol_table_lookup(symbol_table_t *table, const char *name); symbol_t *symbol_table_lookup(symbol_table_t *table, const char *name);
#endif // INCLUDE_ENCODER_SYMBOLS_H_ #endif // INCLUDE_ENCODER_SYMBOLS_H_

View File

@ -88,8 +88,7 @@ error_t *print_encoding(tokenlist_t *list) {
if (node->id != NODE_INSTRUCTION) if (node->id != NODE_INSTRUCTION)
continue; continue;
print_hex(node->value.instruction.encoding.len, print_hex(node->value.encoding.len, node->value.encoding.encoding);
node->value.instruction.encoding.buffer);
} }
encoder_free(encoder); encoder_free(encoder);