Compare commits

...

3 Commits

Author SHA1 Message Date
eab2202f23 SQUASH/REWORD symbols tests
All checks were successful
Validate the build / validate-build (push) Successful in 41s
2025-04-22 17:35:39 +02:00
fab5bedf3d Implement two pass encoding
Some checks failed
Validate the build / validate-build (push) Failing after 50s
First pass:
 - collect information for numbers, registers and which instructions
   contain label references
 - encode all instructions that don't contain label references
 - Set (temporary) addresses for each instruction

Second pass:
 - Collect information about label references (address, offset, size)
 - encode all instructions that contain label references
 - Update (if necessary) addresses for each instruction

 The second pass is iterated 10 times or until no instructions change
 size, whichever comes first.
2025-04-22 02:08:38 +02:00
9a1570e3e5 Add more values to the ast to facilitate encoding
- Add an instruction value that contains the encoding, the address and a
  flag to indicate if this instruction contains label references
- Add a label value that contains an address
- Add a reference value that contains an offset, an absolute address and an
  operand size
2025-04-22 00:54:12 +02:00
4 changed files with 264 additions and 90 deletions

View File

@ -75,7 +75,7 @@ typedef struct register_ {
} register_t;
typedef struct opcode_encoding {
uint8_t encoding[32];
uint8_t buffer[32];
size_t len;
} opcode_encoding_t;
@ -89,7 +89,19 @@ struct ast_node {
union {
register_t reg;
number_t number;
opcode_encoding_t encoding;
struct {
bool has_reference;
opcode_encoding_t encoding;
int64_t address;
} instruction;
struct {
int64_t offset;
int64_t address;
operand_size_t size;
} reference;
struct {
int64_t address;
} label;
} value;
};

View File

@ -6,6 +6,31 @@
#include <errno.h>
#include <string.h>
/**
* General encoder flow:
*
* There are 2 major passes the encoder does:
*
* First pass:
* - Run through the AST and collect information:
* - Set register values
* - Parse/set number values
* - Mark all instructions that use label references
* - Encode all instructions that don't use label references
* - Update addresses of all labels and instructions. Use an estimated
* instruction size for those instructions that use label references.
*
* Second pass:
* - Run through the AST for all instructions that use label references and
* collect size information using the estimated addresses from pass 1
* - Encode label references with the estimated addresses, this fixes their
* size.
* - Update all addresses
*
* Iteration:
* - Repeat the second pass until addresses converge
*/
error_t *const err_encoder_invalid_register =
&(error_t){.message = "Invalid register"};
error_t *const err_encoder_number_overflow =
@ -219,9 +244,11 @@ error_t *encoder_collect_info(encoder_t *encoder, ast_node_t *node,
ast_node_t *statement) {
error_t *err = nullptr;
if (encoder_is_symbols_node(node))
if (encoder_is_symbols_node(node)) {
err = symbol_table_update(encoder->symbols, node, statement);
else if (node->id == NODE_NUMBER)
if (statement->id == NODE_INSTRUCTION)
statement->value.instruction.has_reference = true;
} else if (node->id == NODE_NUMBER)
err = encoder_set_number_value(node);
else if (node->id == NODE_REGISTER)
err = encoder_set_register_value(node);
@ -238,31 +265,6 @@ error_t *encoder_collect_info(encoder_t *encoder, ast_node_t *node,
return nullptr;
}
/**
* Perform the initial pass over the AST.
*
* - Collect information about the operands
* - parse and set number values
* - set the register values
* - determine if label references are used by an instruction
* - encode instructions that don't use label references
* - determine estimated addresses of each statement
*
*/
error_t *encoder_first_pass(encoder_t *encoder) {
ast_node_t *root = encoder->ast;
assert(root->id == NODE_PROGRAM);
for (size_t i = 0; i < root->len; ++i) {
ast_node_t *statement = root->children[i];
error_t *err = encoder_collect_info(encoder, statement, statement);
if (err)
return err;
}
return nullptr;
}
bool is_operand_match(operand_info_t *info, ast_node_t *operand) {
switch (info->kind) {
case OPERAND_REGISTER:
@ -481,7 +483,7 @@ error_t *encoder_encode_instruction(encoder_t *encoder,
return err;
// produce the actual encoding output in the NODE_INSTRUCTION value
uint8_t *output = instruction->value.encoding.encoding;
uint8_t *output = instruction->value.instruction.encoding.buffer;
size_t output_len = 0;
// Handle prefixes
@ -500,26 +502,136 @@ error_t *encoder_encode_instruction(encoder_t *encoder,
memcpy(output + output_len, encoding->buffer, encoding->len);
output_len += encoding->len;
instruction->value.encoding.len = output_len;
instruction->value.instruction.encoding.len = output_len;
return nullptr;
}
/**
* Perform the second pass that performs actual encoding. Will use
* placeholder values for label references because instruction size has not
* yet been determined.
* Perform the initial pass over the AST.
*
* - Collect information about the operands
* - parse and set number values
* - set the register values
* - determine if label references are used by an instruction
* - encode instructions that don't use label references
* - determine estimated addresses of each statement
*
*/
error_t *encoder_second_pass(encoder_t *encoder) {
constexpr size_t instruction_size_estimate = 10;
error_t *encoder_first_pass(encoder_t *encoder) {
ast_node_t *root = encoder->ast;
assert(root->id == NODE_PROGRAM);
uintptr_t address = 0;
for (size_t i = 0; i < root->len; ++i) {
if (root->children[i]->id != NODE_INSTRUCTION)
continue;
ast_node_t *instruction = root->children[i];
error_t *err = encoder_encode_instruction(encoder, instruction);
ast_node_t *statement = root->children[i];
error_t *err = encoder_collect_info(encoder, statement, statement);
if (err)
return err;
if (statement->id == NODE_INSTRUCTION &&
statement->value.instruction.has_reference == false) {
err = encoder_encode_instruction(encoder, statement);
if (err)
return err;
statement->value.instruction.address = address;
address += statement->value.instruction.encoding.len;
} else if (statement->id == NODE_INSTRUCTION) {
statement->value.instruction.encoding.len =
instruction_size_estimate;
statement->value.instruction.address = address;
address += instruction_size_estimate;
} else if (statement->id == NODE_LABEL) {
statement->value.instruction.address = address;
}
}
return nullptr;
}
operand_size_t signed_to_size_mask(int64_t value) {
operand_size_t size = OPERAND_SIZE_64;
if (value >= INT8_MIN && value <= INT8_MAX)
size |= OPERAND_SIZE_8;
if (value >= INT16_MIN && value <= INT16_MAX)
size |= OPERAND_SIZE_16;
if (value >= INT32_MIN && value <= INT32_MAX)
size |= OPERAND_SIZE_32;
return size;
}
int64_t statement_offset(ast_node_t *from, ast_node_t *to) {
assert(from->id == NODE_LABEL || from->id == NODE_INSTRUCTION);
assert(to->id == NODE_LABEL || to->id == NODE_INSTRUCTION);
int64_t from_addr =
from->value.instruction.address + from->value.instruction.encoding.len;
int64_t to_addr = to->value.instruction.address;
return to_addr - from_addr;
}
error_t *encoder_collect_label_info(encoder_t *encoder, ast_node_t *node,
ast_node_t *statement) {
assert(statement->id == NODE_INSTRUCTION);
if (node->id == NODE_LABEL_REFERENCE) {
const char *name = node->token_entry->token.value;
symbol_t *symbol = symbol_table_lookup(encoder->symbols, name);
assert(symbol && symbol->statement &&
symbol->statement->id == NODE_LABEL);
int64_t offset = statement_offset(statement, symbol->statement);
int64_t absolute = symbol->statement->value.instruction.address;
operand_size_t size = signed_to_size_mask(offset);
node->value.reference.address = absolute;
node->value.reference.offset = offset;
node->value.reference.size = size;
}
return nullptr;
}
/**
* Perform the second pass. Updates the label info and encodes all instructions
* that have a label reference.that performs actual encoding.
*/
error_t *encoder_second_pass(encoder_t *encoder, bool *did_update) {
ast_node_t *root = encoder->ast;
*did_update = false;
int64_t address = 0;
for (size_t i = 0; i < root->len; ++i) {
ast_node_t *statement = root->children[i];
if (statement->id == NODE_INSTRUCTION &&
statement->value.instruction.has_reference) {
statement->value.instruction.address = address;
size_t before = statement->value.instruction.encoding.len;
error_t *err =
encoder_collect_label_info(encoder, statement, statement);
if (err)
return err;
err = encoder_encode_instruction(encoder, statement);
if (err)
return err;
size_t after = statement->value.instruction.encoding.len;
address += after;
*did_update = *did_update || (before != after);
} else if (statement->id == NODE_INSTRUCTION &&
statement->value.instruction.has_reference) {
statement->value.instruction.address = address;
address += statement->value.instruction.encoding.len;
} else if (statement->id == NODE_LABEL) {
statement->value.label.address = address;
}
address += statement->value.instruction.encoding.len;
}
return nullptr;
}
@ -549,5 +661,12 @@ error_t *encoder_encode(encoder_t *encoder) {
err = encoder_check_symbols(encoder);
if (err)
return err;
return encoder_second_pass(encoder);
bool did_update = true;
for (int i = 0; i < 10 && did_update; ++i) {
err = encoder_second_pass(encoder, &did_update);
if (err)
return err;
}
return nullptr;
}

View File

@ -88,7 +88,8 @@ error_t *print_encoding(tokenlist_t *list) {
if (node->id != NODE_INSTRUCTION)
continue;
print_hex(node->value.encoding.len, node->value.encoding.encoding);
print_hex(node->value.instruction.encoding.len,
node->value.instruction.encoding.buffer);
}
encoder_free(encoder);

View File

@ -58,17 +58,19 @@ MunitResult test_symbol_add_reference(const MunitParameter params[], void *data)
symbol_table_alloc(&table);
ast_node_t *reference = root->children[3]->children[1]->children[0]->children[0];
ast_node_t *statement = root->children[3]; // The containing statement
munit_assert_int(reference->id, ==, NODE_LABEL_REFERENCE);
munit_assert_size(table->len, ==, 0);
error_t *err = symbol_table_update(table, reference);
error_t *err = symbol_table_update(table, reference, statement);
munit_assert_null(err);
munit_assert_size(table->len, ==, 1);
symbol_t *symbol = symbol_table_lookup(table, "test");
munit_assert_not_null(symbol);
munit_assert_int(SYMBOL_REFERENCE, ==, symbol->kind);
munit_assert_ptr_equal(reference, symbol->node);
// For references, the statement should be nullptr
munit_assert_ptr_null(symbol->statement);
munit_assert_string_equal(symbol->name, "test");
symbol_table_free(table);
@ -90,14 +92,14 @@ MunitResult test_symbol_add_label(const MunitParameter params[], void *data) {
munit_assert_int(label->id, ==, NODE_LABEL);
munit_assert_size(table->len, ==, 0);
error_t *err = symbol_table_update(table, label);
error_t *err = symbol_table_update(table, label, label);
munit_assert_null(err);
munit_assert_size(table->len, ==, 1);
symbol_t *symbol = symbol_table_lookup(table, "test");
munit_assert_not_null(symbol);
munit_assert_int(SYMBOL_LOCAL, ==, symbol->kind);
munit_assert_ptr_equal(label, symbol->node);
munit_assert_ptr_equal(label, symbol->statement);
munit_assert_string_equal(symbol->name, "test");
symbol_table_free(table);
@ -116,17 +118,19 @@ MunitResult test_symbol_add_import(const MunitParameter params[], void *data) {
symbol_table_alloc(&table);
ast_node_t *import_directive = root->children[0]->children[1];
ast_node_t *statement = root->children[0]; // The containing statement
munit_assert_int(import_directive->id, ==, NODE_IMPORT_DIRECTIVE);
munit_assert_size(table->len, ==, 0);
error_t *err = symbol_table_update(table, import_directive);
error_t *err = symbol_table_update(table, import_directive, statement);
munit_assert_null(err);
munit_assert_size(table->len, ==, 1);
symbol_t *symbol = symbol_table_lookup(table, "test");
munit_assert_not_null(symbol);
munit_assert_int(SYMBOL_IMPORT, ==, symbol->kind);
munit_assert_ptr_equal(import_directive, symbol->node);
// For import directives, the statement should be nullptr
munit_assert_ptr_null(symbol->statement);
munit_assert_string_equal(symbol->name, "test");
symbol_table_free(table);
@ -135,42 +139,56 @@ MunitResult test_symbol_add_import(const MunitParameter params[], void *data) {
return MUNIT_OK;
}
void test_symbol_update(const char *name, ast_node_t *first, symbol_kind_t first_kind, ast_node_t *second,
symbol_kind_t second_kind, bool should_succeed, bool should_update) {
void test_symbol_update(const char *name, ast_node_t *first, symbol_kind_t first_kind, ast_node_t *first_statement,
ast_node_t *second, symbol_kind_t second_kind, ast_node_t *second_statement,
bool should_succeed, bool should_update, ast_node_t *expected_statement) {
symbol_table_t *table = nullptr;
symbol_table_alloc(&table);
munit_assert_size(table->len, ==, 0);
error_t *err = symbol_table_update(table, first);
// Add the first symbol
error_t *err = symbol_table_update(table, first, first_statement);
munit_assert_null(err);
munit_assert_size(table->len, ==, 1);
// Verify first symbol state
symbol_t *symbol = symbol_table_lookup(table, name);
munit_assert_not_null(symbol);
munit_assert_int(first_kind, ==, symbol->kind);
munit_assert_ptr_equal(first, symbol->node);
munit_assert_string_equal(symbol->name, name);
err = symbol_table_update(table, second);
if (should_succeed)
munit_assert_null(err);
else
munit_assert_ptr_equal(err, err_symbol_table_incompatible_symbols);
munit_assert_size(table->len, ==, 1);
symbol = symbol_table_lookup(table, name);
if (should_update) {
munit_assert_not_null(symbol);
munit_assert_int(second_kind, ==, symbol->kind);
munit_assert_ptr_equal(second, symbol->node);
munit_assert_string_equal(symbol->name, name);
// Check statement based on symbol kind
if (first_kind == SYMBOL_LOCAL) {
munit_assert_ptr_equal(first_statement, symbol->statement);
} else {
munit_assert_not_null(symbol);
munit_assert_int(first_kind, ==, symbol->kind);
munit_assert_ptr_equal(first, symbol->node);
munit_assert_string_equal(symbol->name, name);
munit_assert_ptr_null(symbol->statement);
}
// Attempt the second update
err = symbol_table_update(table, second, second_statement);
// Check if update succeeded as expected
if (should_succeed) {
munit_assert_null(err);
} else {
munit_assert_ptr_equal(err, err_symbol_table_incompatible_symbols);
symbol_table_free(table);
return;
}
// Verify symbol after second update
symbol = symbol_table_lookup(table, name);
munit_assert_not_null(symbol);
// Check if kind updated as expected
if (should_update) {
munit_assert_int(second_kind, ==, symbol->kind);
} else {
munit_assert_int(first_kind, ==, symbol->kind);
}
// Simply check against the expected statement value
munit_assert_ptr_equal(expected_statement, symbol->statement);
symbol_table_free(table);
}
@ -181,28 +199,43 @@ MunitResult test_symbol_upgrade_valid(const MunitParameter params[], void *data)
symbols_setup_test(&root, &list, "tests/input/symbols.asm");
ast_node_t *reference = root->children[3]->children[1]->children[0]->children[0];
ast_node_t *reference_statement = root->children[3];
ast_node_t *label = root->children[2];
ast_node_t *import_directive = root->children[0]->children[1];
ast_node_t *import_statement = root->children[0];
ast_node_t *export_directive = root->children[1]->children[1];
ast_node_t *export_statement = root->children[1];
// real upgrades
test_symbol_update("test", reference, SYMBOL_REFERENCE, label, SYMBOL_LOCAL, true, true);
test_symbol_update("test", reference, SYMBOL_REFERENCE, import_directive, SYMBOL_IMPORT, true, true);
test_symbol_update("test", reference, SYMBOL_REFERENCE, export_directive, SYMBOL_EXPORT, true, true);
test_symbol_update("test", label, SYMBOL_LOCAL, export_directive, SYMBOL_EXPORT, true, true);
test_symbol_update("test", reference, SYMBOL_REFERENCE, reference_statement, label, SYMBOL_LOCAL, label, true, true,
label);
test_symbol_update("test", reference, SYMBOL_REFERENCE, reference_statement, import_directive, SYMBOL_IMPORT,
import_statement, true, true, nullptr);
test_symbol_update("test", reference, SYMBOL_REFERENCE, reference_statement, export_directive, SYMBOL_EXPORT,
export_statement, true, true, nullptr);
test_symbol_update("test", label, SYMBOL_LOCAL, label, export_directive, SYMBOL_EXPORT, export_statement, true,
true, label);
// identity upgrades
test_symbol_update("test", reference, SYMBOL_REFERENCE, reference, SYMBOL_REFERENCE, true, false);
test_symbol_update("test", label, SYMBOL_LOCAL, label, SYMBOL_LOCAL, true, false);
test_symbol_update("test", import_directive, SYMBOL_IMPORT, import_directive, SYMBOL_IMPORT, true, false);
test_symbol_update("test", export_directive, SYMBOL_EXPORT, export_directive, SYMBOL_EXPORT, true, false);
test_symbol_update("test", reference, SYMBOL_REFERENCE, reference_statement, reference, SYMBOL_REFERENCE,
reference_statement, true, false, nullptr);
test_symbol_update("test", label, SYMBOL_LOCAL, label, label, SYMBOL_LOCAL, label, true, false, label);
test_symbol_update("test", import_directive, SYMBOL_IMPORT, import_statement, import_directive, SYMBOL_IMPORT,
import_statement, true, false, nullptr);
test_symbol_update("test", export_directive, SYMBOL_EXPORT, export_statement, export_directive, SYMBOL_EXPORT,
export_statement, true, false, nullptr);
// downgrades that are allowed and ignored
test_symbol_update("test", label, SYMBOL_LOCAL, reference, SYMBOL_REFERENCE, true, false);
test_symbol_update("test", import_directive, SYMBOL_IMPORT, reference, SYMBOL_REFERENCE, true, false);
test_symbol_update("test", export_directive, SYMBOL_EXPORT, reference, SYMBOL_REFERENCE, true, false);
test_symbol_update("test", export_directive, SYMBOL_EXPORT, label, SYMBOL_LOCAL, true, false);
test_symbol_update("test", import_directive, SYMBOL_IMPORT, label, SYMBOL_LOCAL, true, false);
test_symbol_update("test", label, SYMBOL_LOCAL, label, reference, SYMBOL_REFERENCE, reference_statement, true,
false, label);
test_symbol_update("test", import_directive, SYMBOL_IMPORT, import_statement, reference, SYMBOL_REFERENCE,
reference_statement, true, false, nullptr);
test_symbol_update("test", export_directive, SYMBOL_EXPORT, export_statement, reference, SYMBOL_REFERENCE,
reference_statement, true, false, nullptr);
test_symbol_update("test", export_directive, SYMBOL_EXPORT, export_statement, label, SYMBOL_LOCAL, label, true,
false, label);
test_symbol_update("test", import_directive, SYMBOL_IMPORT, import_statement, label, SYMBOL_LOCAL, label, true,
false, label);
ast_node_free(root);
tokenlist_free(list);
@ -216,14 +249,20 @@ MunitResult test_symbol_upgrade_invalid(const MunitParameter params[], void *dat
symbols_setup_test(&root, &list, "tests/input/symbols.asm");
ast_node_t *reference = root->children[3]->children[1]->children[0]->children[0];
ast_node_t *reference_statement = root->children[3];
ast_node_t *label = root->children[2];
ast_node_t *import_directive = root->children[0]->children[1];
ast_node_t *import_statement = root->children[0];
ast_node_t *export_directive = root->children[1]->children[1];
ast_node_t *export_statement = root->children[1];
// invalid upgrades
test_symbol_update("test", label, SYMBOL_LOCAL, import_directive, SYMBOL_IMPORT, false, false);
test_symbol_update("test", export_directive, SYMBOL_EXPORT, import_directive, SYMBOL_IMPORT, false, false);
test_symbol_update("test", import_directive, SYMBOL_IMPORT, export_directive, SYMBOL_EXPORT, false, false);
test_symbol_update("test", label, SYMBOL_LOCAL, label, import_directive, SYMBOL_IMPORT, import_statement, false,
false, nullptr);
test_symbol_update("test", export_directive, SYMBOL_EXPORT, export_statement, import_directive, SYMBOL_IMPORT,
import_statement, false, false, nullptr);
test_symbol_update("test", import_directive, SYMBOL_IMPORT, import_statement, export_directive, SYMBOL_EXPORT,
export_statement, false, false, nullptr);
ast_node_free(root);
tokenlist_free(list);
@ -240,17 +279,19 @@ MunitResult test_symbol_add_export(const MunitParameter params[], void *data) {
symbol_table_alloc(&table);
ast_node_t *export_directive = root->children[1]->children[1];
ast_node_t *statement = root->children[1]; // The containing statement
munit_assert_int(export_directive->id, ==, NODE_EXPORT_DIRECTIVE);
munit_assert_size(table->len, ==, 0);
error_t *err = symbol_table_update(table, export_directive);
error_t *err = symbol_table_update(table, export_directive, statement);
munit_assert_null(err);
munit_assert_size(table->len, ==, 1);
symbol_t *symbol = symbol_table_lookup(table, "test");
munit_assert_not_null(symbol);
munit_assert_int(SYMBOL_EXPORT, ==, symbol->kind);
munit_assert_ptr_equal(export_directive, symbol->node);
// For export directives, the statement should be nullptr
munit_assert_ptr_null(symbol->statement);
munit_assert_string_equal(symbol->name, "test");
symbol_table_free(table);
@ -280,7 +321,7 @@ MunitResult test_symbol_table_growth(const MunitParameter params[], void *data)
ast_node_t *label = root->children[i];
munit_assert_int(label->id, ==, NODE_LABEL);
error_t *err = symbol_table_update(table, label);
error_t *err = symbol_table_update(table, label, label);
munit_assert_null(err);
munit_assert_size(table->len, ==, i + 1);
@ -292,7 +333,7 @@ MunitResult test_symbol_table_growth(const MunitParameter params[], void *data)
ast_node_t *final_label = root->children[64];
munit_assert_int(final_label->id, ==, NODE_LABEL);
error_t *err = symbol_table_update(table, final_label);
error_t *err = symbol_table_update(table, final_label, final_label);
munit_assert_null(err);
munit_assert_size(table->len, ==, 65);
@ -308,6 +349,7 @@ MunitResult test_symbol_table_growth(const MunitParameter params[], void *data)
munit_assert_not_null(symbol);
munit_assert_int(SYMBOL_LOCAL, ==, symbol->kind);
munit_assert_string_equal(symbol->name, name);
munit_assert_ptr_equal(symbol->statement, root->children[i]);
}
symbol_table_free(table);
@ -326,7 +368,7 @@ MunitResult test_symbol_invalid_node(const MunitParameter params[], void *data)
symbol_table_alloc(&table);
munit_assert_size(table->len, ==, 0);
error_t *err = symbol_table_update(table, root);
error_t *err = symbol_table_update(table, root, root);
munit_assert_ptr_equal(err, err_symbol_table_invalid_node);
munit_assert_size(table->len, ==, 0);