Initial encoding implementation #19

Merged
omicron merged 14 commits from encoding_start into main 2025-04-16 22:16:32 +00:00
23 changed files with 1970 additions and 48 deletions

View File

@ -3,10 +3,14 @@
<label> ::= <identifier> <colon>
<directive> ::= <dot> <section_directive> <newline>
<directive> ::= <dot> (<section_directive> | <export_directive> | <import_directive> ) <newline>
<section_directive> ::= "section" <identifier>
<export_directive> ::= "export" <identifier>
<import_directive> ::= "import" <identifier>
<instruction> ::= <identifier> <operands> <newline>
<operands> ::= <operand> ( <comma> <operand> )*

View File

@ -123,6 +123,10 @@ const char *ast_node_id_to_cstr(node_id_t id) {
return "NODE_PLUS_OR_MINUS";
case NODE_SECTION_DIRECTIVE:
return "NODE_SECTION_DIRECTIVE";
case NODE_IMPORT_DIRECTIVE:
return "NODE_IMPORT_DIRECTIVE";
case NODE_EXPORT_DIRECTIVE:
return "NODE_EXPORT_DIRECTIVE";
case NODE_REGISTER:
return "NODE_REGISTER";
case NODE_SECTION:
@ -159,6 +163,10 @@ const char *ast_node_id_to_cstr(node_id_t id) {
return "NODE_DOT";
case NODE_NEWLINE:
return "NODE_NEWLINE";
case NODE_IMPORT:
return "NODE_IMPORT";
case NODE_EXPORT:
return "NODE_EXPORT";
}
assert(!"Unreachable, weird node id" && id);
__builtin_unreachable();

View File

@ -1,6 +1,7 @@
#ifndef INCLUDE_SRC_AST_H_
#define INCLUDE_SRC_AST_H_
#include "data/registers.h"
#include "error.h"
#include "lexer.h"
#include "tokenlist.h"
@ -29,10 +30,14 @@ typedef enum node_id {
NODE_REGISTER_OFFSET,
NODE_PLUS_OR_MINUS,
NODE_SECTION_DIRECTIVE,
NODE_IMPORT_DIRECTIVE,
NODE_EXPORT_DIRECTIVE,
// Validated primitives
NODE_REGISTER,
NODE_SECTION,
NODE_IMPORT,
NODE_EXPORT,
// Primitive nodes
NODE_IDENTIFIER,
@ -59,6 +64,21 @@ constexpr size_t node_default_children_cap = 8;
/* 65K ought to be enough for anybody */
constexpr size_t node_max_children_cap = 1 << 16;
/* Parsed value of a numeric literal, stored in the value union of an AST
 * node. */
typedef struct number {
// the numeric value of the literal
uint64_t value;
// operand size information for the value; may contain several
// OPERAND_SIZE_* bits or'ed together when the literal has no explicit size
operand_size_t size;
} number_t;
/* Resolved register operand, stored in the value union of an AST node. */
typedef struct register_ {
// which register this is
register_id_t id;
// width of the register as a single OPERAND_SIZE_* value
operand_size_t size;
} register_t;
/* Machine code bytes produced for a single instruction. */
typedef struct opcode_encoding {
// storage for the encoded bytes
uint8_t encoding[32];
// number of bytes in `encoding` that are in use
size_t len;
} opcode_encoding_t;
struct ast_node {
node_id_t id;
tokenlist_entry_t *token_entry;
@ -67,11 +87,9 @@ struct ast_node {
ast_node_t **children;
union {
struct {
uint64_t value;
int size;
} integer;
char *name;
register_t reg;
number_t number;
opcode_encoding_t encoding;
} value;
};

6
src/bytes.c Normal file
View File

@ -0,0 +1,6 @@
#include "bytes.h"
#include "error.h"
// Error value reporting that a bytes buffer has no room left for more data.
error_t *const err_bytes_no_capacity = &(error_t){
.message = "Not enough capacity in bytes buffer",
};

60
src/bytes.h Normal file
View File

@ -0,0 +1,60 @@
#ifndef INCLUDE_SRC_BYTES_H_
#define INCLUDE_SRC_BYTES_H_
#include "error.h"
#include <stddef.h>
#include <stdint.h>
#include <string.h>
extern error_t *const err_bytes_no_capacity;
/* Fixed-capacity byte buffer with its storage placed directly behind the
 * header as a flexible array member. */
typedef struct bytes {
// number of bytes currently stored in `buffer`
size_t len;
// total number of bytes `buffer` can hold
size_t cap;
// the stored bytes; valid indices are [0, len)
uint8_t buffer[];
} bytes_t;
// Expands to the address of a compound literal with the same leading members
// as bytes_t but a fixed-size buffer of N bytes; len starts at 0 and cap at N.
#define LOCAL_BYTES_ANONYMOUS(N) \
&(struct { \
size_t len; \
size_t cap; \
uint8_t buffer[(N)]; \
}) { \
0, (N), {} \
}
// Same as LOCAL_BYTES_ANONYMOUS but cast to bytes_t * so the result can be
// passed to the bytes_append_* helpers.
// NOTE(review): the expansion ends in ';', so this macro only works as a full
// initializer (`bytes_t *b = LOCAL_BYTES(32);` yields an extra empty
// statement) and cannot be used inside a larger expression.
#define LOCAL_BYTES(N) (bytes_t *)LOCAL_BYTES_ANONYMOUS(N);
/**
 * Append a single byte to the end of the buffer.
 *
 * @param bytes buffer to append to
 * @param value byte to store
 * @return nullptr on success, err_bytes_no_capacity if the buffer is full
 */
static inline error_t *bytes_append_uint8(bytes_t *bytes, uint8_t value) {
    const size_t position = bytes->len;
    if (position < bytes->cap) {
        bytes->buffer[position] = value;
        bytes->len = position + 1;
        return nullptr;
    }
    return err_bytes_no_capacity;
}
/**
 * Append n bytes from `buffer` to the end of `dst`.
 *
 * The append succeeds whenever the data fits, including the case where it
 * fills the buffer exactly (len + n == cap), matching bytes_append_uint8.
 *
 * @param dst    buffer to append to
 * @param n      number of bytes to copy
 * @param buffer source of at least n bytes
 * @return nullptr on success, err_bytes_no_capacity if fewer than n bytes of
 *         space remain (dst is left unmodified in that case)
 */
static inline error_t *bytes_append_array(bytes_t *dst, size_t n,
                                          uint8_t buffer[static n]) {
    // Compare against the remaining space instead of computing len + n, so
    // the check cannot wrap around for very large n. len never exceeds cap
    // because every append helper checks capacity before writing.
    if (n > dst->cap - dst->len)
        return err_bytes_no_capacity;
    memcpy(dst->buffer + dst->len, buffer, n);
    dst->len += n;
    return nullptr;
}
/**
 * Append the stored contents of `src` to the end of `dst`.
 *
 * @return the result of bytes_append_array for src's bytes
 */
static inline error_t *bytes_append_bytes(bytes_t *dst, bytes_t *src) {
    const size_t count = src->len;
    uint8_t *data = src->buffer;
    return bytes_append_array(dst, count, data);
}
/**
 * Append a 16-bit value to `dst`. The bytes are copied exactly as the value
 * is laid out in memory on the host.
 *
 * @return the result of bytes_append_array
 */
static inline error_t *bytes_append_uint16(bytes_t *dst, uint16_t value) {
    uint8_t *raw = (uint8_t *)&value;
    return bytes_append_array(dst, sizeof(uint16_t), raw);
}
/**
 * Append a 32-bit value to `dst`. The bytes are copied exactly as the value
 * is laid out in memory on the host.
 *
 * @return the result of bytes_append_array
 */
static inline error_t *bytes_append_uint32(bytes_t *dst, uint32_t value) {
    uint8_t *raw = (uint8_t *)&value;
    return bytes_append_array(dst, sizeof(uint32_t), raw);
}
/**
 * Append a 64-bit value to `dst`. The bytes are copied exactly as the value
 * is laid out in memory on the host.
 *
 * @return the result of bytes_append_array
 */
static inline error_t *bytes_append_uint64(bytes_t *dst, uint64_t value) {
    uint8_t *raw = (uint8_t *)&value;
    return bytes_append_array(dst, sizeof(uint64_t), raw);
}
#endif // INCLUDE_SRC_BYTES_H_

145
src/data/opcodes.c Normal file
View File

@ -0,0 +1,145 @@
#include "opcodes.h"
// clang-format off
// Table of all known instruction encodings, terminated by a nullptr entry.
// Several entries may share a mnemonic; they differ in operand count, kind
// and size. Fields that are not listed in an initializer are zero, so
// encoding_class defaults to ENCODING_DEFAULT and all prefix flags to false.
opcode_data_t *const opcodes[] = {
    // RET
    &(opcode_data_t) {
        .mnemonic = "ret",
        .opcode = 0xC3,
        .opcode_extension = opcode_extension_none,
        .operand_count = 0,
    },
    // RET imm16
    &(opcode_data_t) {
        .mnemonic = "ret",
        .opcode = 0xC2,
        .opcode_extension = opcode_extension_none,
        .operand_count = 1,
        .operands = {
            { .kind = OPERAND_IMMEDIATE, .size = OPERAND_SIZE_16 },
        },
    },
    // PUSH imm8
    &(opcode_data_t) {
        .mnemonic = "push",
        .opcode = 0x6A,
        .opcode_extension = opcode_extension_none,
        .operand_count = 1,
        .operands = {
            { .kind = OPERAND_IMMEDIATE, .size = OPERAND_SIZE_8},
        },
    },
    // PUSH imm16
    &(opcode_data_t) {
        .mnemonic = "push",
        .opcode = 0x68,
        .opcode_extension = opcode_extension_none,
        .operand_size_prefix = true,
        .operand_count = 1,
        .operands = {
            { .kind = OPERAND_IMMEDIATE, .size = OPERAND_SIZE_16},
        },
    },
    // PUSH imm32
    &(opcode_data_t) {
        .mnemonic = "push",
        .opcode = 0x68,
        .opcode_extension = opcode_extension_none,
        .operand_size_prefix = false,
        .operand_count = 1,
        .operands = {
            { .kind = OPERAND_IMMEDIATE, .size = OPERAND_SIZE_32},
        },
    },
    // PUSH reg16,
    &(opcode_data_t) {
        .mnemonic = "push",
        .opcode = 0x50,
        .opcode_extension = opcode_extension_none,
        .encoding_class = ENCODING_OPCODE_REGISTER,
        // without the 0x66 prefix this would be byte-for-byte identical to
        // the PUSH reg64 encoding below
        .operand_size_prefix = true,
        .operand_count = 1,
        .operands = {
            { .kind = OPERAND_REGISTER, .size = OPERAND_SIZE_16 },
        },
    },
    // PUSH reg64
    &(opcode_data_t) {
        .mnemonic = "push",
        .opcode = 0x50,
        .opcode_extension = opcode_extension_none,
        .encoding_class = ENCODING_OPCODE_REGISTER,
        .operand_count = 1,
        .operands = {
            { .kind = OPERAND_REGISTER, .size = OPERAND_SIZE_64 },
        },
    },
    // NOT reg16
    &(opcode_data_t) {
        .mnemonic = "not",
        .opcode = 0xF7,
        .opcode_extension = 2,
        .operand_size_prefix = true,
        .operand_count = 1,
        .operands = {
            { .kind = OPERAND_REGISTER, .size = OPERAND_SIZE_16 },
        },
    },
    // NOT reg32
    &(opcode_data_t) {
        .mnemonic = "not",
        .opcode = 0xF7,
        .opcode_extension = 2,
        .operand_count = 1,
        .operands = {
            { .kind = OPERAND_REGISTER, .size = OPERAND_SIZE_32 },
        },
    },
    // NOT reg64
    &(opcode_data_t) {
        .mnemonic = "not",
        .opcode = 0xF7,
        .opcode_extension = 2,
        .rex_w_prefix = true,
        .operand_count = 1,
        .operands = {
            { .kind = OPERAND_REGISTER, .size = OPERAND_SIZE_64 },
        },
    },
    // NEG reg16
    &(opcode_data_t) {
        .mnemonic = "neg",
        .opcode = 0xF7,
        .opcode_extension = 3,
        .operand_size_prefix = true,
        .operand_count = 1,
        .operands = {
            { .kind = OPERAND_REGISTER, .size = OPERAND_SIZE_16 },
        },
    },
    // NEG reg32
    &(opcode_data_t) {
        .mnemonic = "neg",
        .opcode = 0xF7,
        .opcode_extension = 3,
        .operand_count = 1,
        .operands = {
            { .kind = OPERAND_REGISTER, .size = OPERAND_SIZE_32 },
        },
    },
    // NEG reg64
    &(opcode_data_t) {
        .mnemonic = "neg",
        .opcode = 0xF7,
        .opcode_extension = 3,
        .rex_w_prefix = true,
        .operand_count = 1,
        .operands = {
            { .kind = OPERAND_REGISTER, .size = OPERAND_SIZE_64 },
        },
    },
    nullptr,
};

56
src/data/opcodes.h Normal file
View File

@ -0,0 +1,56 @@
#ifndef INCLUDE_DATA_OPCODES_H_
#define INCLUDE_DATA_OPCODES_H_
#include "../data/registers.h"
#include <stddef.h>
#include <stdint.h>
// REX prefix byte values: the bare prefix (0x40) and the prefix with the W,
// R, X or B bit set. The single-bit variants can be or'ed together.
constexpr uint8_t rex_prefix = 0x40;
constexpr uint8_t rex_prefix_w = 0x48;
constexpr uint8_t rex_prefix_r = 0x44;
constexpr uint8_t rex_prefix_x = 0x42;
constexpr uint8_t rex_prefix_b = 0x41;
// Single-byte prefixes that are emitted in front of the opcode
constexpr uint8_t operand_size_prefix = 0x66;
constexpr uint8_t memory_size_prefix = 0x67;
constexpr uint8_t lock_prefix = 0xF0;
constexpr uint8_t repne_prefix = 0xF2;
constexpr uint8_t rep_prefix = 0xF3;
// Selects how the operands of an opcode are placed in the encoding.
// ENCODING_DEFAULT has the value 0, so it applies to every opcode_data_t
// initializer that does not set encoding_class explicitly.
typedef enum encoding_class {
ENCODING_DEFAULT, // use modrm+sib for registers and memory, append
// immediates
ENCODING_OPCODE_REGISTER, // encode the register in the last 3 bits of the
// opcode
} encoding_class_t;
// The kinds of operand an opcode can accept in a given position.
typedef enum operand_kind {
OPERAND_REGISTER,
OPERAND_MEMORY,
OPERAND_IMMEDIATE,
} operand_kind_t;
// Describes one operand slot of an opcode: what kind of operand it takes and
// at which size.
typedef struct operand_info {
operand_kind_t kind;
operand_size_t size;
} operand_info_t;
// Value of opcode_data_t.opcode_extension for opcodes without an extension.
// It lies outside the 3 bit range of real extensions.
constexpr uint8_t opcode_extension_none = 0xFF;
// Static description of one encodable form of an instruction.
typedef struct opcode_data {
// instruction name as written in the assembly source
const char *mnemonic;
// opcode byte(s); 16 bits wide so a two byte opcode fits
uint16_t opcode;
uint8_t opcode_extension; // 3 bits for the opcode extension in the reg
// field of a modr/m byte
encoding_class_t encoding_class;
// flags requesting that the corresponding prefix is part of the encoding
bool operand_size_prefix;
bool address_size_prefix;
bool rex_w_prefix;
// number of entries of `operands` that are in use
size_t operand_count;
operand_info_t operands[3];
} opcode_data_t;
extern opcode_data_t *const opcodes[];
#endif // INCLUDE_DATA_OPCODES_H_

92
src/data/registers.c Normal file
View File

@ -0,0 +1,92 @@
#include "registers.h"
// Table of all known register names with their register id and size,
// terminated by a nullptr entry. Names that refer to the same register at a
// different width share the same id and differ only in size.
register_data_t *const registers[] = {
// Instruction pointer
&(register_data_t){"rip", REG_RIP, OPERAND_SIZE_64},
&(register_data_t){"eip", REG_RIP, OPERAND_SIZE_32},
&(register_data_t){"ip", REG_RIP, OPERAND_SIZE_16},
// 64-bit general purpose registers
&(register_data_t){"rax", REG_A, OPERAND_SIZE_64},
&(register_data_t){"rcx", REG_C, OPERAND_SIZE_64},
&(register_data_t){"rdx", REG_D, OPERAND_SIZE_64},
&(register_data_t){"rbx", REG_B, OPERAND_SIZE_64},
&(register_data_t){"rsp", REG_SP, OPERAND_SIZE_64},
&(register_data_t){"rbp", REG_BP, OPERAND_SIZE_64},
&(register_data_t){"rsi", REG_SI, OPERAND_SIZE_64},
&(register_data_t){"rdi", REG_DI, OPERAND_SIZE_64},
&(register_data_t){"r8", REG_8, OPERAND_SIZE_64},
&(register_data_t){"r9", REG_9, OPERAND_SIZE_64},
&(register_data_t){"r10", REG_10, OPERAND_SIZE_64},
&(register_data_t){"r11", REG_11, OPERAND_SIZE_64},
&(register_data_t){"r12", REG_12, OPERAND_SIZE_64},
&(register_data_t){"r13", REG_13, OPERAND_SIZE_64},
&(register_data_t){"r14", REG_14, OPERAND_SIZE_64},
&(register_data_t){"r15", REG_15, OPERAND_SIZE_64},
// 32-bit general purpose registers
&(register_data_t){"eax", REG_A, OPERAND_SIZE_32},
&(register_data_t){"ecx", REG_C, OPERAND_SIZE_32},
&(register_data_t){"edx", REG_D, OPERAND_SIZE_32},
&(register_data_t){"ebx", REG_B, OPERAND_SIZE_32},
&(register_data_t){"esp", REG_SP, OPERAND_SIZE_32},
&(register_data_t){"ebp", REG_BP, OPERAND_SIZE_32},
&(register_data_t){"esi", REG_SI, OPERAND_SIZE_32},
&(register_data_t){"edi", REG_DI, OPERAND_SIZE_32},
&(register_data_t){"r8d", REG_8, OPERAND_SIZE_32},
&(register_data_t){"r9d", REG_9, OPERAND_SIZE_32},
&(register_data_t){"r10d", REG_10, OPERAND_SIZE_32},
&(register_data_t){"r11d", REG_11, OPERAND_SIZE_32},
&(register_data_t){"r12d", REG_12, OPERAND_SIZE_32},
&(register_data_t){"r13d", REG_13, OPERAND_SIZE_32},
&(register_data_t){"r14d", REG_14, OPERAND_SIZE_32},
&(register_data_t){"r15d", REG_15, OPERAND_SIZE_32},
// 16-bit general purpose registers
&(register_data_t){"ax", REG_A, OPERAND_SIZE_16},
&(register_data_t){"cx", REG_C, OPERAND_SIZE_16},
&(register_data_t){"dx", REG_D, OPERAND_SIZE_16},
&(register_data_t){"bx", REG_B, OPERAND_SIZE_16},
&(register_data_t){"sp", REG_SP, OPERAND_SIZE_16},
&(register_data_t){"bp", REG_BP, OPERAND_SIZE_16},
&(register_data_t){"si", REG_SI, OPERAND_SIZE_16},
&(register_data_t){"di", REG_DI, OPERAND_SIZE_16},
&(register_data_t){"r8w", REG_8, OPERAND_SIZE_16},
&(register_data_t){"r9w", REG_9, OPERAND_SIZE_16},
&(register_data_t){"r10w", REG_10, OPERAND_SIZE_16},
&(register_data_t){"r11w", REG_11, OPERAND_SIZE_16},
&(register_data_t){"r12w", REG_12, OPERAND_SIZE_16},
&(register_data_t){"r13w", REG_13, OPERAND_SIZE_16},
&(register_data_t){"r14w", REG_14, OPERAND_SIZE_16},
&(register_data_t){"r15w", REG_15, OPERAND_SIZE_16},
// 8-bit general purpose registers (low byte)
&(register_data_t){"al", REG_A, OPERAND_SIZE_8 },
&(register_data_t){"cl", REG_C, OPERAND_SIZE_8 },
&(register_data_t){"dl", REG_D, OPERAND_SIZE_8 },
&(register_data_t){"bl", REG_B, OPERAND_SIZE_8 },
&(register_data_t){"spl", REG_SP, OPERAND_SIZE_8 },
&(register_data_t){"bpl", REG_BP, OPERAND_SIZE_8 },
&(register_data_t){"sil", REG_SI, OPERAND_SIZE_8 },
&(register_data_t){"dil", REG_DI, OPERAND_SIZE_8 },
&(register_data_t){"r8b", REG_8, OPERAND_SIZE_8 },
&(register_data_t){"r9b", REG_9, OPERAND_SIZE_8 },
&(register_data_t){"r10b", REG_10, OPERAND_SIZE_8 },
&(register_data_t){"r11b", REG_11, OPERAND_SIZE_8 },
&(register_data_t){"r12b", REG_12, OPERAND_SIZE_8 },
&(register_data_t){"r13b", REG_13, OPERAND_SIZE_8 },
&(register_data_t){"r14b", REG_14, OPERAND_SIZE_8 },
&(register_data_t){"r15b", REG_15, OPERAND_SIZE_8 },
// x87 floating point registers
&(register_data_t){"st0", REG_ST0, OPERAND_SIZE_80},
&(register_data_t){"st1", REG_ST1, OPERAND_SIZE_80},
&(register_data_t){"st2", REG_ST2, OPERAND_SIZE_80},
&(register_data_t){"st3", REG_ST3, OPERAND_SIZE_80},
&(register_data_t){"st4", REG_ST4, OPERAND_SIZE_80},
&(register_data_t){"st5", REG_ST5, OPERAND_SIZE_80},
&(register_data_t){"st6", REG_ST6, OPERAND_SIZE_80},
&(register_data_t){"st7", REG_ST7, OPERAND_SIZE_80},
nullptr,
};

82
src/data/registers.h Normal file
View File

@ -0,0 +1,82 @@
#ifndef INCLUDE_DATA_REGISTERS_H_
#define INCLUDE_DATA_REGISTERS_H_
// Operand sizes in bits. Every size is a distinct single bit, so several
// sizes can be or'ed together into one value; OPERAND_SIZE_INVALID is the
// value with no bits set.
typedef enum operand_size {
OPERAND_SIZE_INVALID = 0,
OPERAND_SIZE_8 = 1 << 0,
OPERAND_SIZE_16 = 1 << 1,
OPERAND_SIZE_32 = 1 << 2,
OPERAND_SIZE_64 = 1 << 3,
OPERAND_SIZE_80 = 1 << 4,
OPERAND_SIZE_128 = 1 << 5,
OPERAND_SIZE_256 = 1 << 6,
OPERAND_SIZE_512 = 1 << 7,
} operand_size_t;
/**
 * Translate a width in bits to the matching operand size constant.
 *
 * @param bits width in bits
 * @return the OPERAND_SIZE_* value for 8, 16, 32, 64, 80, 128, 256 or 512
 *         bits, OPERAND_SIZE_INVALID for every other width
 */
static inline operand_size_t bits_to_operand_size(int bits) {
    if (bits == 8)
        return OPERAND_SIZE_8;
    if (bits == 16)
        return OPERAND_SIZE_16;
    if (bits == 32)
        return OPERAND_SIZE_32;
    if (bits == 64)
        return OPERAND_SIZE_64;
    if (bits == 80)
        return OPERAND_SIZE_80;
    if (bits == 128)
        return OPERAND_SIZE_128;
    if (bits == 256)
        return OPERAND_SIZE_256;
    if (bits == 512)
        return OPERAND_SIZE_512;
    return OPERAND_SIZE_INVALID;
}
// Identifies a register independent of the width it is accessed at.
// The general purpose registers are numbered consecutively from 0 (REG_A)
// to 15 (REG_15); the x87 registers are numbered from 0x1000.
typedef enum register_id {
// Special registers
REG_RIP = -1,
// General purpose registers
REG_A = 0x0000,
REG_C,
REG_D,
REG_B,
REG_SP,
REG_BP,
REG_SI,
REG_DI,
REG_8,
REG_9,
REG_10,
REG_11,
REG_12,
REG_13,
REG_14,
REG_15,
// x87 floating point registers
REG_ST0 = 0x1000,
REG_ST1,
REG_ST2,
REG_ST3,
REG_ST4,
REG_ST5,
REG_ST6,
REG_ST7,
} register_id_t;
// One entry of the register table: a register name together with the
// register it refers to and the size it is accessed at.
typedef struct register_data {
const char *name;
register_id_t id;
operand_size_t size;
} register_data_t;
extern register_data_t *const registers[];
#endif // INCLUDE_DATA_REGISTERS_H_

526
src/encoder/encoder.c Normal file
View File

@ -0,0 +1,526 @@
#include "encoder.h"
#include "../bytes.h"
#include "../data/opcodes.h"
#include "symbols.h"
#include <assert.h>
#include <errno.h>
#include <string.h>
// Error values returned by the encoder functions in this file.
error_t *const err_encoder_invalid_register =
&(error_t){.message = "Invalid register"};
error_t *const err_encoder_number_overflow =
&(error_t){.message = "Number overflows the storage"};
error_t *const err_encoder_invalid_number_format =
&(error_t){.message = "Invalid number format"};
error_t *const err_encoder_invalid_size_suffix =
&(error_t){.message = "Invalid number size suffix"};
error_t *const err_encoder_unknown_symbol_reference =
&(error_t){.message = "Referenced an unknown symbol"};
error_t *const err_encoder_no_encoding_found =
&(error_t){.message = "No encoding found for instruction"};
error_t *const err_encoder_not_implemented =
&(error_t){.message = "Implementation for this opcode is missing"};
error_t *const err_encoder_unexpected_length =
&(error_t){.message = "Unexpectedly long encoding"};
/**
 * Allocate a new encoder together with its symbol table.
 *
 * @param output receives the new encoder on success and nullptr on failure
 * @return nullptr on success, err_allocation_failed if the encoder could not
 *         be allocated, or the error reported by symbol_table_alloc
 */
error_t *encoder_alloc(encoder_t **output) {
    *output = nullptr;

    encoder_t *result = calloc(1, sizeof(encoder_t));
    if (!result)
        return err_allocation_failed;

    error_t *err = symbol_table_alloc(&result->symbols);
    if (err == nullptr)
        *output = result;
    else
        free(result);
    return err;
}
/**
 * Release an encoder and its symbol table. Passing nullptr is allowed and
 * does nothing.
 */
void encoder_free(encoder_t *encoder) {
    if (encoder != nullptr) {
        symbol_table_free(encoder->symbols);
        free(encoder);
    }
}
/**
 * Tell whether a node has to be recorded in the symbol table.
 *
 * @return true for labels, label references and import/export directives,
 *         false for every other node
 */
bool encoder_is_symbols_node(ast_node_t *node) {
    const node_id_t id = node->id;
    return id == NODE_LABEL || id == NODE_LABEL_REFERENCE ||
           id == NODE_EXPORT_DIRECTIVE || id == NODE_IMPORT_DIRECTIVE;
}
/**
 * Determine the numeric base of a number node from the id of its first child.
 *
 * @return 2, 8, 10 or 16; any other child id trips an assertion
 */
int encoder_get_number_base(ast_node_t *number) {
    const node_id_t kind = number->children[0]->id;
    if (kind == NODE_BINARY)
        return 2;
    if (kind == NODE_OCTAL)
        return 8;
    if (kind == NODE_DECIMAL)
        return 10;
    if (kind == NODE_HEXADECIMAL)
        return 16;
    assert(false);
    __builtin_unreachable();
}
/**
 * Check whether a size suffix is one of the accepted widths.
 *
 * @param bits width given by the suffix; 0 stands for "no suffix"
 * @return true for 0, 8, 16, 32 and 64, false otherwise
 */
bool is_valid_size_suffix(int bits) {
    return bits == 0 || bits == 8 || bits == 16 || bits == 32 || bits == 64;
}
/**
 * Check whether a value needs more than `bits` bits of storage.
 *
 * @param value the value to check
 * @param bits  available width; 0 and widths of 64 or more never overflow
 * @return true if value does not fit in `bits` bits
 */
bool is_overflow(uint64_t value, int bits) {
    const bool has_limit = bits != 0 && bits < 64;
    if (!has_limit)
        return false;
    // anything left after dropping the low `bits` bits does not fit
    return (value >> bits) != 0;
}
/**
 * Compute the operand size information for a number.
 *
 * @param value the numeric value
 * @param bits  explicit width in bits, or 0 if none was given
 * @return for an explicit width the single matching operand size; otherwise
 *         a mask with a bit for each of the 8, 16, 32 and 64 bit sizes that
 *         is large enough to hold the value (64 is always included)
 */
operand_size_t encoder_get_size_mask(uint64_t value, int bits) {
    if (bits != 0)
        return bits_to_operand_size(bits);

    operand_size_t mask = OPERAND_SIZE_64;
    // a value that fits a smaller size also fits every larger one
    if (value <= 0xFFFFFFFF) {
        mask |= OPERAND_SIZE_32;
        if (value <= 0xFFFF) {
            mask |= OPERAND_SIZE_16;
            if (value <= 0xFF)
                mask |= OPERAND_SIZE_8;
        }
    }
    return mask;
}
/**
 * Parse the text of a number node and store the result in
 * node->value.number.
 *
 * The text is read from the token of the node's first child. It consists of
 * the digits (with a two character prefix for every base except 10)
 * optionally followed by ':' and a decimal size in bits.
 *
 * @param node a NODE_NUMBER node
 * @return nullptr on success,
 *         err_encoder_number_overflow if the value does not fit in 64 bits
 *         or in the size given by the suffix,
 *         err_encoder_invalid_number_format if the digits or the suffix
 *         cannot be parsed or are followed by other characters,
 *         err_encoder_invalid_size_suffix if the suffix is not 8, 16, 32
 *         or 64
 */
error_t *encoder_set_number_value(ast_node_t *node) {
    assert(node->id == NODE_NUMBER);
    assert(node->children[0]);
    const char *number = node->children[0]->token_entry->token.value;
    int base = encoder_get_number_base(node);
    if (base != 10)
        number += 2; // all except base 10 use a 0x, 0o or 0b prefix

    char *endptr;
    errno = 0;
    uint64_t value = strtoull(number, &endptr, base);
    if (errno == ERANGE)
        return err_encoder_number_overflow;
    if (endptr == number)
        return err_encoder_invalid_number_format;

    int bits = 0;
    if (*endptr == ':') {
        const char *suffix = endptr + 1;
        long parsed = strtol(suffix, &endptr, 10);
        if (endptr == suffix)
            return err_encoder_invalid_number_format;
        // Narrowing an out of range long to int could wrap around to a
        // valid width (e.g. 2^32 + 8 becoming 8). Map everything outside
        // 0..64 to -1, which is_valid_size_suffix rejects below.
        bits = (parsed < 0 || parsed > 64) ? -1 : (int)parsed;
    }
    if (*endptr != '\0')
        return err_encoder_invalid_number_format;
    if (!is_valid_size_suffix(bits))
        return err_encoder_invalid_size_suffix;
    if (is_overflow(value, bits))
        return err_encoder_number_overflow;

    node->value.number.value = value;
    node->value.number.size = encoder_get_size_mask(value, bits);
    return nullptr;
}
/**
 * Look up the token text of a register node in the register table and store
 * the register id and size in node->value.reg.
 *
 * @param node a NODE_REGISTER node
 * @return nullptr on success, err_encoder_invalid_register if no register
 *         with that name exists
 */
error_t *encoder_set_register_value(ast_node_t *node) {
    assert(node->id == NODE_REGISTER);
    const char *name = node->token_entry->token.value;

    for (register_data_t *const *entry = registers; *entry != nullptr;
         ++entry) {
        if (strcmp(name, (*entry)->name) != 0)
            continue;
        node->value.reg.id = (*entry)->id;
        node->value.reg.size = (*entry)->size;
        return nullptr;
    }
    return err_encoder_invalid_register;
}
/**
 * Store an opcode extension in the reg field of a modrm byte.
 *
 * @param modrm     the modrm byte to update
 * @param extension the extension, must fit in 3 bits
 * @return modrm with its reg field replaced by the extension
 */
static inline uint8_t modrm_extension(uint8_t modrm, uint8_t extension) {
    assert(extension != opcode_extension_none);
    assert((extension & 0b111) == extension);
    const uint8_t without_reg = modrm & ~modrm_reg_mask;
    return without_reg | extension << 3;
}
/**
 * Add the rex bit that belongs to a register placed in the modrm reg field.
 *
 * @return rex with rex_prefix_r added if bit 3 of the register id is set,
 *         otherwise rex unchanged
 */
static inline uint8_t modrm_reg_rex(uint8_t rex, register_id_t id) {
    return (id & 0b1000) ? (rex | rex_prefix_r) : rex;
}
/**
 * Place the low 3 bits of a register id in the reg field of a modrm byte.
 * Must be used together with modrm_reg_rex, which takes care of the
 * remaining bit of the id.
 */
static inline uint8_t modrm_reg(uint8_t modrm, register_id_t id) {
    const uint8_t reg_field = (id & 0b111) << 3;
    return (modrm & ~modrm_reg_mask) | reg_field;
}
/**
 * Add the rex bit that belongs to a register placed in the modrm rm field.
 *
 * @return rex with rex_prefix_b added if bit 3 of the register id is set,
 *         otherwise rex unchanged
 */
static inline uint8_t modrm_rm_rex(uint8_t rex, register_id_t id) {
    return (id & 0b1000) ? (rex | rex_prefix_b) : rex;
}
/**
 * Place the low 3 bits of a register id in the rm field of a modrm byte.
 * The mod field of the byte must already select register mode. Must be used
 * together with modrm_rm_rex, which takes care of the remaining bit of the
 * id.
 */
static inline uint8_t modrm_rm(uint8_t modrm, register_id_t id) {
    assert((modrm & modrm_mod_mask) == modrm_mod_register);
    const uint8_t rm_field = id & 0b111;
    return (modrm & ~modrm_rm_mask) | rm_field;
}
/**
 * First pass over the AST: walks the tree depth first, adds every symbol
 * node to the symbol table and computes the values of number and register
 * nodes.
 *
 * @return nullptr on success, otherwise the first error encountered; the
 *         walk stops at that point
 */
error_t *encoder_first_pass(encoder_t *encoder, ast_node_t *node) {
    error_t *err = nullptr;

    if (encoder_is_symbols_node(node)) {
        err = symbol_table_update(encoder->symbols, node);
    } else {
        switch (node->id) {
        case NODE_NUMBER:
            err = encoder_set_number_value(node);
            break;
        case NODE_REGISTER:
            err = encoder_set_register_value(node);
            break;
        default:
            break;
        }
    }

    // children are only visited when the node itself was handled without
    // error, and visiting stops at the first failing child
    for (size_t i = 0; err == nullptr && i < node->len; ++i)
        err = encoder_first_pass(encoder, node->children[i]);
    return err;
}
/**
 * Check whether an operand node is acceptable for an operand slot of an
 * opcode.
 *
 * - register slots need a register node of exactly the slot's size
 * - memory slots accept any memory node
 * - immediate slots need an immediate node; a number child matches if its
 *   size mask contains the slot's size, a label reference child only matches
 *   32 bit slots
 *
 * An immediate with any other kind of child, or an unknown slot kind, trips
 * an assertion.
 */
bool is_operand_match(operand_info_t *info, ast_node_t *operand) {
    if (info->kind == OPERAND_REGISTER)
        return operand->id == NODE_REGISTER &&
               operand->value.reg.size == info->size;

    if (info->kind == OPERAND_MEMORY)
        return operand->id == NODE_MEMORY;

    if (info->kind == OPERAND_IMMEDIATE) {
        if (operand->id != NODE_IMMEDIATE)
            return false;
        ast_node_t *child = operand->children[0];
        switch (child->id) {
        case NODE_NUMBER:
            return (child->value.number.size & info->size) > 0;
        case NODE_LABEL_REFERENCE:
            // FIXME: first pass should give us information about the
            // distance of the label reference so we can pick a size more
            // appropriately instead of just defaulting to 32 bits
            return info->size == OPERAND_SIZE_32;
        default:
            break;
        }
    }
    assert(false && "unreachable");
    __builtin_unreachable();
}
/**
 * Check whether an opcode table entry can encode an instruction.
 *
 * @param opcode   the table entry to test
 * @param mnemonic mnemonic of the instruction
 * @param operands node whose children are the operands of the instruction
 * @return true if the mnemonic is equal, the operand count is equal and
 *         every operand matches its slot
 */
bool is_opcode_match(opcode_data_t *opcode, const char *mnemonic,
                     ast_node_t *operands) {
    if (strcmp(opcode->mnemonic, mnemonic) != 0 ||
        opcode->operand_count != operands->len)
        return false;

    // advance past every matching operand; reaching the end means all match
    size_t matched = 0;
    while (matched < operands->len &&
           is_operand_match(&opcode->operands[matched],
                            operands->children[matched]))
        ++matched;
    return matched == operands->len;
}
/**
 * Find the first entry of the opcode table that matches an instruction.
 *
 * @param instruction instruction node; the token of its first child is the
 *                    mnemonic
 * @param operands    node whose children are the operands
 * @param opcode_out  receives the matching entry; left untouched if none
 *                    matches
 * @return nullptr on success, err_encoder_no_encoding_found otherwise
 */
error_t *encoder_get_opcode_data(ast_node_t *instruction, ast_node_t *operands,
                                 opcode_data_t **opcode_out) {
    const char *mnemonic = instruction->children[0]->token_entry->token.value;

    for (opcode_data_t *const *candidate = opcodes; *candidate; ++candidate) {
        if (!is_opcode_match(*candidate, mnemonic, operands))
            continue;
        *opcode_out = *candidate;
        return nullptr;
    }
    return err_encoder_no_encoding_found;
}
/**
 * Placeholder for encoding instructions with two operands.
 *
 * Expects the opcode byte(s) to be in `encoding` already.
 *
 * @return always err_encoder_not_implemented
 */
error_t *encode_two_operand(encoder_t *encoder, opcode_data_t *opcode,
                            ast_node_t *operands, bytes_t *encoding,
                            uint8_t *rex) {
    // nothing is used yet, the casts silence unused parameter warnings
    (void)rex;
    (void)encoding;
    (void)operands;
    (void)opcode;
    (void)encoder;
    assert(encoding->len >= 1 && "must have 1+ opcode byte in buffer already");
    return err_encoder_not_implemented;
}
/**
 * Encode a single register operand inside the opcode byte itself
 * (ENCODING_OPCODE_REGISTER).
 *
 * The low 3 bits of the register id are or'ed into the last byte of
 * `encoding`, which must be the opcode byte. For registers 8-15 the fourth
 * bit is carried by REX.B, the rex bit that extends a register stored in the
 * opcode byte (REX.R only extends the reg field of a modrm byte).
 *
 * @param operands node with exactly one register child
 * @param encoding buffer that already holds the opcode
 * @param rex      rex prefix being accumulated for the instruction
 * @return always nullptr
 */
error_t *encode_one_register_in_opcode(encoder_t *encoder,
                                       opcode_data_t *opcode,
                                       ast_node_t *operands, bytes_t *encoding,
                                       uint8_t *rex) {
    (void)encoder;
    (void)opcode;
    assert(operands->len == 1);
    assert(operands->children[0]->id == NODE_REGISTER);
    assert(encoding->len >= 1 && "must have 1+ opcode byte in buffer already");

    register_id_t id = operands->children[0]->value.reg.id;
    encoding->buffer[encoding->len - 1] |= id & 0b111;
    if ((id & 0b1000) > 0) {
        *rex |= rex_prefix_b;
    }
    return nullptr;
}
/**
 * Encode a single register operand in a modrm byte and append that byte to
 * `encoding`.
 *
 * @param opcode   table entry of the instruction, decides where the register
 *                 goes
 * @param operands node with exactly one register child
 * @param encoding buffer the modrm byte is appended to
 * @param rex      rex prefix being accumulated for the instruction
 * @return the result of appending the modrm byte
 */
error_t *encode_one_register(encoder_t *encoder, opcode_data_t *opcode,
                             ast_node_t *operands, bytes_t *encoding,
                             uint8_t *rex) {
    (void)encoder;
    assert(operands->len == 1);
    assert(operands->children[0]->id == NODE_REGISTER);

    const register_id_t id = operands->children[0]->value.reg.id;
    const bool has_extension =
        opcode->opcode_extension != opcode_extension_none;

    uint8_t modrm = modrm_mod_register;
    if (!has_extension) {
        // register goes in reg field
        // NOTE:
        // it's actually likely this case just doesn't exist at all and all
        // opcodes that take one register in modr/m _all_ have extended
        // opcodes
        modrm = modrm_reg(modrm, id);
        *rex = modrm_reg_rex(*rex, id);
    } else {
        // extension goes in reg field, register goes in rm field
        modrm = modrm_extension(modrm, opcode->opcode_extension);
        modrm = modrm_rm(modrm, id);
        *rex = modrm_rm_rex(*rex, id);
    }
    return bytes_append_uint8(encoding, modrm);
}
/**
 * Encode a single immediate operand by appending its bytes to `encoding`.
 *
 * Numbers are appended at the size of the opcode's first operand slot.
 * Label references are not resolved yet; a 32 bit placeholder (0xDEADBEEF)
 * is appended for them.
 *
 * @param opcode   table entry of the instruction
 * @param operands node with exactly one immediate child
 * @param encoding buffer the immediate is appended to
 * @return nullptr on success, the error of the append on failure, or
 *         err_encoder_not_implemented for an operand size other than 8, 16,
 *         32 or 64 bits
 */
error_t *encode_one_immediate(encoder_t *encoder, opcode_data_t *opcode,
                              ast_node_t *operands, bytes_t *encoding,
                              uint8_t *rex) {
    (void)encoder;
    (void)rex;
    assert(operands->len == 1);
    assert(operands->children[0]->id == NODE_IMMEDIATE);
    assert(operands->children[0]->len == 1);
    ast_node_t *immediate = operands->children[0]->children[0];
    assert(immediate->id == NODE_NUMBER ||
           immediate->id == NODE_LABEL_REFERENCE);

    if (immediate->id == NODE_NUMBER) {
        uint64_t value = immediate->value.number.value;
        operand_size_t size = opcode->operands[0].size;
        error_t *err = nullptr;
        switch (size) {
        case OPERAND_SIZE_8:
            err = bytes_append_uint8(encoding, value);
            break;
        case OPERAND_SIZE_16:
            err = bytes_append_uint16(encoding, value);
            break;
        case OPERAND_SIZE_32:
            err = bytes_append_uint32(encoding, value);
            break;
        case OPERAND_SIZE_64:
            err = bytes_append_uint64(encoding, value);
            break;
        default:
            assert(false && "intentionally unhandled");
            // when assertions are compiled out, report the problem instead
            // of returning success without having emitted the immediate
            err = err_encoder_not_implemented;
            break;
        }
        return err;
    } else {
        // FIXME: this still assumes references are always 32 bit
        uint32_t value = 0xDEADBEEF;
        return bytes_append_uint32(encoding, value);
    }
}
/**
 * Placeholder for encoding a single memory operand.
 *
 * @return always err_encoder_not_implemented
 */
error_t *encode_one_memory(encoder_t *encoder, opcode_data_t *opcode,
                           ast_node_t *operands, bytes_t *encoding,
                           uint8_t *rex) {
    // nothing is used yet, the casts silence unused parameter warnings
    (void)rex;
    (void)encoding;
    (void)operands;
    (void)opcode;
    (void)encoder;
    return err_encoder_not_implemented;
}
/**
 * Encode the operand of a one operand instruction by dispatching on the
 * kind of the opcode's first operand slot.
 *
 * Register operands are encoded in the opcode byte for
 * ENCODING_OPCODE_REGISTER opcodes and in a modrm byte otherwise.
 *
 * @return the result of the selected encoder, or
 *         err_encoder_not_implemented if the operand kind is not one of the
 *         known kinds
 */
error_t *encode_one_operand(encoder_t *encoder, opcode_data_t *opcode,
                            ast_node_t *operands, bytes_t *encoding,
                            uint8_t *rex) {
    switch (opcode->operands[0].kind) {
    case OPERAND_REGISTER:
        if (opcode->encoding_class == ENCODING_OPCODE_REGISTER)
            return encode_one_register_in_opcode(encoder, opcode, operands,
                                                 encoding, rex);
        else
            return encode_one_register(encoder, opcode, operands, encoding,
                                       rex);
    case OPERAND_MEMORY:
        return encode_one_memory(encoder, opcode, operands, encoding, rex);
    case OPERAND_IMMEDIATE:
        return encode_one_immediate(encoder, opcode, operands, encoding, rex);
    }
    // Only reached when kind holds a value outside of the enum. Without this
    // return control would run off the end of a non-void function.
    return err_encoder_not_implemented;
}
/**
 * Encode a single instruction and store the result in
 * instruction->value.encoding.
 *
 * The opcode and operands are first encoded into a local buffer. The final
 * encoding is then assembled as: address size prefix, operand size prefix,
 * rex prefix (each only if needed), followed by the local buffer.
 *
 * @param encoder     the encoder, passed on to the operand encoders
 * @param instruction instruction node; its first child carries the mnemonic,
 *                    its second child the operands
 * @return nullptr on success,
 *         err_encoder_no_encoding_found if no opcode matches,
 *         err_encoder_not_implemented for more than two operands,
 *         err_encoder_unexpected_length if opcode and operands take more
 *         than 20 bytes,
 *         or any error of the operand encoders
 */
error_t *encoder_encode_instruction(encoder_t *encoder,
                                    ast_node_t *instruction) {
    ast_node_t *operands = instruction->children[1];

    opcode_data_t *opcode = nullptr;
    error_t *err = encoder_get_opcode_data(instruction, operands, &opcode);
    if (err)
        return err;

    uint8_t rex = 0;
    bytes_t *encoding = LOCAL_BYTES(32);

    // two byte opcodes are emitted high byte first
    if (opcode->opcode > 0xFF &&
        (err = bytes_append_uint8(encoding, opcode->opcode >> 8)))
        return err;
    if ((err = bytes_append_uint8(encoding, opcode->opcode & 0xFF)))
        return err;

    // NOTE:operand encoders all expect the opcode to be in the buffer already.
    // Some of them rely on this to encode the register value in the opcode
    // byte.
    switch (opcode->operand_count) {
    case 0:
        break;
    case 1:
        err = encode_one_operand(encoder, opcode, operands, encoding, &rex);
        break;
    case 2:
        err = encode_two_operand(encoder, opcode, operands, encoding, &rex);
        break;
    default:
        err = err_encoder_not_implemented;
    }
    if (err)
        return err;

    // produce the actual encoding output in the NODE_INSTRUCTION value
    uint8_t *output = instruction->value.encoding.encoding;
    size_t output_len = 0;

    // Handle prefixes
    // The W bit is added to whatever the operand encoders already stored in
    // rex; assigning here would drop the register extension bits they set.
    if (opcode->rex_w_prefix)
        rex |= rex_prefix_w;
    if (opcode->address_size_prefix)
        output[output_len++] = memory_size_prefix;
    if (opcode->operand_size_prefix)
        output[output_len++] = operand_size_prefix;
    if (rex > 0)
        output[output_len++] = rex;

    // copy the encoded opcode and operands
    if (encoding->len > 20)
        return err_encoder_unexpected_length;
    memcpy(output + output_len, encoding->buffer, encoding->len);
    output_len += encoding->len;
    instruction->value.encoding.len = output_len;
    return nullptr;
}
/**
 * Second pass: encode every instruction node that is a direct child of the
 * root. Label references are encoded with placeholder values because
 * instruction sizes have not been determined yet.
 *
 * @return nullptr on success, otherwise the error of the first instruction
 *         that failed to encode
 */
error_t *encoder_encoding_pass(encoder_t *encoder, ast_node_t *root) {
    for (size_t i = 0; i < root->len; ++i) {
        ast_node_t *child = root->children[i];
        if (child->id == NODE_INSTRUCTION) {
            error_t *err = encoder_encode_instruction(encoder, child);
            if (err)
                return err;
        }
    }
    return nullptr;
}
/**
 * Find the first entry of the opcode table that matches an instruction.
 *
 * @param instruction instruction node; its first child carries the mnemonic,
 *                    its second child the operands
 * @return the matching table entry, or nullptr if there is none
 */
opcode_data_t *encoder_find_opcode(ast_node_t *instruction) {
    for (opcode_data_t *const *entry = opcodes; *entry != nullptr; ++entry) {
        ast_node_t *operands = instruction->children[1];
        const char *mnemonic =
            instruction->children[0]->token_entry->token.value;
        if (is_opcode_match(*entry, mnemonic, operands))
            return *entry;
    }
    return nullptr;
}
/**
 * Verify that no symbol in the table is still a plain reference, i.e. that
 * every referenced name was also defined, imported or exported.
 *
 * @return nullptr if all symbols are resolved,
 *         err_encoder_unknown_symbol_reference otherwise
 */
error_t *encoder_check_symbols(encoder_t *encoder) {
    const symbol_table_t *table = encoder->symbols;
    size_t i = 0;
    while (i < table->len) {
        if (table->symbols[i].kind == SYMBOL_REFERENCE)
            return err_encoder_unknown_symbol_reference;
        ++i;
    }
    return nullptr;
}
/**
 * Encode a complete AST: run the first pass, check that all symbols are
 * known and then run the encoding pass.
 *
 * @return nullptr on success, otherwise the error of the first step that
 *         failed; later steps are not run
 */
error_t *encoder_encode(encoder_t *encoder, ast_node_t *ast) {
    error_t *err = encoder_first_pass(encoder, ast);
    if (!err)
        err = encoder_check_symbols(encoder);
    if (!err)
        err = encoder_encoding_pass(encoder, ast);
    return err;
}

32
src/encoder/encoder.h Normal file
View File

@ -0,0 +1,32 @@
#ifndef INCLUDE_ENCODER_ENCODER_H_
#define INCLUDE_ENCODER_ENCODER_H_
#include "symbols.h"
// State of the encoder: the table of symbols found in the program.
typedef struct encoder {
symbol_table_t *symbols;
} encoder_t;
// Values for the mod field (the top 2 bits) of a modrm byte
constexpr uint8_t modrm_mod_memory = 0b00'000'000;
constexpr uint8_t modrm_mod_memory_displacement8 = 0b01'000'000;
constexpr uint8_t modrm_mod_memory_displacement32 = 0b10'000'000;
constexpr uint8_t modrm_mod_register = 0b11'000'000;
// Masks selecting the reg (middle 3 bits), rm (low 3 bits) and mod (top 2
// bits) fields of a modrm byte
constexpr uint8_t modrm_reg_mask = 0b00'111'000;
constexpr uint8_t modrm_rm_mask = 0b00'000'111;
constexpr uint8_t modrm_mod_mask = 0b11'000'000;
error_t *encoder_alloc(encoder_t **encoder);
error_t *encoder_encode(encoder_t *encoder, ast_node_t *ast);
void encoder_free(encoder_t *encoder);
extern error_t *const err_encoder_invalid_register;
extern error_t *const err_encoder_number_overflow;
extern error_t *const err_encoder_invalid_number_format;
extern error_t *const err_encoder_invalid_size_suffix;
extern error_t *const err_encoder_unknown_symbol_reference;
extern error_t *const err_encoder_no_encoding_found;
extern error_t *const err_encoder_not_implemented;
extern error_t *const err_encoder_unexpected_length;
#endif // INCLUDE_ENCODER_ENCODER_H_

159
src/encoder/symbols.c Normal file
View File

@ -0,0 +1,159 @@
#include "symbols.h"
#include "../error.h"
#include <assert.h>
#include <stdlib.h>
#include <string.h>
// Number of symbols a new table has room for
constexpr size_t symbol_table_default_cap = 64;
// Capacity at which the table refuses to grow any further
constexpr size_t symbol_table_max_cap = 1 << 16;
// Error values returned by the symbol table functions
error_t *const err_symbol_table_invalid_node = &(error_t){
.message = "Unexpected node id when adding symbol to symbol table"};
error_t *const err_symbol_table_max_cap = &(error_t){
.message = "Failed to increase symbol table length, max capacity reached"};
error_t *const err_symbol_table_incompatible_symbols =
&(error_t){.message = "Failed to update symbol with incompatible kind"};
/**
 * Allocate an empty symbol table with room for symbol_table_default_cap
 * symbols.
 *
 * @param output receives the new table on success and nullptr on failure
 * @return nullptr on success, err_allocation_failed otherwise
 */
error_t *symbol_table_alloc(symbol_table_t **output) {
    *output = nullptr;

    symbol_table_t *table = calloc(1, sizeof(symbol_table_t));
    if (!table)
        return err_allocation_failed;

    symbol_t *slots = calloc(symbol_table_default_cap, sizeof(symbol_t));
    if (!slots) {
        free(table);
        return err_allocation_failed;
    }

    table->symbols = slots;
    table->cap = symbol_table_default_cap;
    table->len = 0;
    *output = table;
    return nullptr;
}
/**
 * Release a symbol table and its symbol storage. The symbol names are not
 * freed here.
 *
 * Passing nullptr is allowed and does nothing, like free() and
 * encoder_free().
 */
void symbol_table_free(symbol_table_t *table) {
    if (table == nullptr)
        return;
    free(table->symbols);
    free(table);
}
/**
 * Double the capacity of a symbol table.
 *
 * @return nullptr on success, err_symbol_table_max_cap if the table already
 *         is at or above symbol_table_max_cap, err_allocation_failed if the
 *         reallocation fails (the table is left unchanged in both error
 *         cases)
 */
error_t *symbol_table_grow_cap(symbol_table_t *table) {
    if (table->cap >= symbol_table_max_cap)
        return err_symbol_table_max_cap;

    const size_t doubled = table->cap * 2;
    symbol_t *resized = realloc(table->symbols, doubled * sizeof(symbol_t));
    if (resized != nullptr) {
        table->symbols = resized;
        table->cap = doubled;
        return nullptr;
    }
    return err_allocation_failed;
}
/**
 * Determine the symbol kind and name a node stands for.
 *
 * The name is taken from the token of the node itself for label references,
 * of the first child for labels, and of the second child for import and
 * export directives.
 *
 * @param node the node to inspect
 * @param kind receives the symbol kind
 * @param name receives the symbol name (not copied)
 * @return nullptr on success, err_symbol_table_invalid_node for any other
 *         node id (kind and name are left untouched then)
 */
error_t *symbol_table_get_node_info(ast_node_t *node, symbol_kind_t *kind,
                                    char **name) {
    symbol_kind_t found_kind;
    ast_node_t *name_node;

    switch (node->id) {
    case NODE_LABEL:
        found_kind = SYMBOL_LOCAL;
        name_node = node->children[0];
        break;
    case NODE_LABEL_REFERENCE:
        found_kind = SYMBOL_REFERENCE;
        name_node = node;
        break;
    case NODE_IMPORT_DIRECTIVE:
        found_kind = SYMBOL_IMPORT;
        name_node = node->children[1];
        break;
    case NODE_EXPORT_DIRECTIVE:
        found_kind = SYMBOL_EXPORT;
        name_node = node->children[1];
        break;
    default:
        return err_symbol_table_invalid_node;
    }

    *kind = found_kind;
    *name = name_node->token_entry->token.value;
    return nullptr;
}
/*
    What happens when a name that is already in the table is seen again:

    old \ new    | REFERENCE | LOCAL    | IMPORT   | EXPORT   |
    -------------|-----------|----------|----------|----------|
    REFERENCE    |           | replace  | replace  | replace  |
    -------------|-----------|----------|----------|----------|
    LOCAL        |           |          | ERR      | replace  |
    -------------|-----------|----------|----------|----------|
    IMPORT       |           |          |          | ERR      |
    -------------|-----------|----------|----------|----------|
    EXPORT       |           |          | ERR      |          |
    -------------|-----------|----------|----------|----------|

    This function answers the "replace" cells of the table.
*/
bool symbol_table_should_update(symbol_kind_t old, symbol_kind_t new) {
    switch (old) {
    case SYMBOL_REFERENCE:
        return new != SYMBOL_REFERENCE;
    case SYMBOL_LOCAL:
        return new == SYMBOL_EXPORT;
    default:
        return false;
    }
}
/**
 * Tell whether seeing a name again with kind `new` conflicts with the kind
 * `old` it already has in the table.
 *
 * @return true when importing a name that is local or exported, or when
 *         exporting a name that is imported
 */
bool symbol_table_should_error(symbol_kind_t old, symbol_kind_t new) {
    const bool old_is_local_or_export =
        old == SYMBOL_LOCAL || old == SYMBOL_EXPORT;
    return (new == SYMBOL_IMPORT && old_is_local_or_export) ||
           (new == SYMBOL_EXPORT && old == SYMBOL_IMPORT);
}
/**
 * Append a new symbol to the table, growing the table first if it is full.
 *
 * @pre The symbol _must not_ already be in the table.
 *
 * @return nullptr on success, or the error of symbol_table_grow_cap
 */
error_t *symbol_table_add(symbol_table_t *table, char *name, symbol_kind_t kind,
                          ast_node_t *node) {
    if (table->len >= table->cap) {
        error_t *err = symbol_table_grow_cap(table);
        if (err)
            return err;
    }

    symbol_t *slot = &table->symbols[table->len];
    *slot = (symbol_t){
        .node = node,
        .kind = kind,
        .name = name,
    };
    table->len++;
    return nullptr;
}
/**
 * Record the symbol a node stands for.
 *
 * A name that is not in the table yet is added. For a known name the
 * existing entry is replaced, kept or rejected depending on the old and new
 * symbol kind.
 *
 * @return nullptr on success,
 *         err_symbol_table_invalid_node if the node is not a symbol node,
 *         err_symbol_table_incompatible_symbols if the new kind conflicts
 *         with the existing one,
 *         or the error of symbol_table_add
 */
error_t *symbol_table_update(symbol_table_t *table, ast_node_t *node) {
    char *name;
    symbol_kind_t kind;
    error_t *err = symbol_table_get_node_info(node, &kind, &name);
    if (err)
        return err;

    symbol_t *existing = symbol_table_lookup(table, name);
    if (existing == nullptr)
        return symbol_table_add(table, name, kind, node);

    const symbol_kind_t old_kind = existing->kind;
    if (symbol_table_should_error(old_kind, kind))
        return err_symbol_table_incompatible_symbols;
    if (!symbol_table_should_update(old_kind, kind))
        return nullptr;

    existing->name = name;
    existing->kind = kind;
    existing->node = node;
    return nullptr;
}
/**
 * Linear search for a symbol by exact name.
 *
 * @return pointer to the matching entry inside the table, or nullptr when no
 *         entry has that name
 */
symbol_t *symbol_table_lookup(symbol_table_t *table, const char *name) {
    size_t index = 0;
    while (index < table->len) {
        symbol_t *candidate = &table->symbols[index];
        if (!strcmp(candidate->name, name))
            return candidate;
        index += 1;
    }
    return nullptr;
}

46
src/encoder/symbols.h Normal file
View File

@ -0,0 +1,46 @@
#ifndef INCLUDE_ENCODER_SYMBOLS_H_
#define INCLUDE_ENCODER_SYMBOLS_H_
#include "../ast.h"
extern error_t *const err_symbol_table_invalid_node;
extern error_t *const err_symbol_table_max_cap;
extern error_t *const err_symbol_table_incompatible_symbols;
/**
 * Kind of a symbol table entry. The kind is derived from the AST node the
 * symbol was created from (see symbol_table_update).
 */
typedef enum symbol_kind {
/* created from a NODE_LABEL_REFERENCE: the name is used, nothing more known */
SYMBOL_REFERENCE,
/* created from a NODE_LABEL: the name is defined by a label */
SYMBOL_LOCAL,
/* created from a NODE_EXPORT_DIRECTIVE */
SYMBOL_EXPORT,
/* created from a NODE_IMPORT_DIRECTIVE */
SYMBOL_IMPORT,
} symbol_kind_t;
/**
 * Represent a symbol in the program
 *
 * Symbols with the same name can only be in the table once. IMPORT or EXPORT
 * symbols take precedence over REFERENCE symbols. If any reference symbols
 * remain after the first encoding pass this indicates an error. Trying to add
 * an IMPORT or EXPORT symbol if the same name already exists as the other kind
 * is an error.
 *
 * The symbol table never takes ownership of the name string; the pointer is
 * lifted straight from the token value of the AST node
 * (token_entry->token.value).
 */
typedef struct symbol {
/* symbol name; borrowed from the token, not owned by the table */
char *name;
/* current kind of the symbol */
symbol_kind_t kind;
/* AST node that produced the current kind */
ast_node_t *node;
} symbol_t;
/**
 * Growable array of symbols. Entries are appended at index len; the table is
 * grown before appending once len reaches cap.
 */
typedef struct symbol_table {
/* number of slots available in symbols */
size_t cap;
/* number of slots currently in use */
size_t len;
/* backing array; the first len entries are valid */
symbol_t *symbols;
} symbol_table_t;
error_t *symbol_table_alloc(symbol_table_t **table);
void symbol_table_free(symbol_table_t *table);
/**
 * Add the symbol described by node, or upgrade the existing entry with the
 * same name. Returns nullptr on success, err_symbol_table_invalid_node when
 * the node does not describe a symbol and
 * err_symbol_table_incompatible_symbols when the new kind conflicts with the
 * existing one.
 */
error_t *symbol_table_update(symbol_table_t *table, ast_node_t *node);
/**
 * Find a symbol by exact name. Returns a pointer to the entry inside the
 * table, or nullptr when the name is not present.
 */
symbol_t *symbol_table_lookup(symbol_table_t *table, const char *name);
#endif // INCLUDE_ENCODER_SYMBOLS_H_

View File

@ -1,3 +1,5 @@
#include "ast.h"
#include "encoder/encoder.h"
#include "error.h"
#include "lexer.h"
#include "parser/parser.h"
@ -8,7 +10,13 @@
#include <stdlib.h>
#include <string.h>
typedef enum mode { MODE_AST, MODE_TEXT, MODE_TOKENS } mode_t;
typedef enum mode {
MODE_INVALID = -1,
MODE_AST,
MODE_TEXT,
MODE_TOKENS,
MODE_ENCODING,
} mode_t;
void print_tokens(tokenlist_t *list) {
for (auto entry = list->head; entry; entry = entry->next) {
@ -32,39 +40,105 @@ void print_text(tokenlist_t *list) {
}
}
void print_ast(tokenlist_t *list) {
error_t *print_ast(tokenlist_t *list) {
parse_result_t result = parse(list->head);
if (result.err) {
puts(result.err->message);
error_free(result.err);
return;
}
if (result.err)
return result.err;
ast_node_print(result.node);
if (result.next != nullptr) {
puts("First unparsed token:");
lexer_token_print(&result.next->token);
}
ast_node_free(result.node);
if (result.next != nullptr) {
return errorf("did not parse entire input token stream");
}
return nullptr;
}
/**
 * Print `len` bytes to stdout as two-digit lowercase hex values separated by
 * single spaces, followed by a newline.
 */
void print_hex(size_t len, uint8_t bytes[static len]) {
    for (size_t pos = 0; pos < len; pos++) {
        /* separator goes between bytes, never after the last one */
        if (pos > 0)
            printf(" ");
        printf("%02x", bytes[pos]);
    }
    printf("\n");
}
/**
 * Parse the token list, encode the resulting AST and print the encoding of
 * every top-level NODE_INSTRUCTION child as a line of hex bytes.
 *
 * @param list token list to parse
 * @return nullptr on success, otherwise the error reported by the parser,
 *         encoder_alloc or encoder_encode
 */
error_t *print_encoding(tokenlist_t *list) {
    parse_result_t result = parse(list->head);
    if (result.err)
        return result.err;

    encoder_t *encoder;
    error_t *err = encoder_alloc(&encoder);
    if (err)
        goto cleanup_ast;

    err = encoder_encode(encoder, result.node);
    if (err)
        goto cleanup_encoder; /* the encoder exists here and must be freed */

    ast_node_t *root = result.node;
    for (size_t i = 0; i < root->len; ++i) {
        ast_node_t *node = root->children[i];
        if (node->id != NODE_INSTRUCTION)
            continue;
        print_hex(node->value.encoding.len, node->value.encoding.encoding);
    }

    /* success falls through the cleanup chain with err == nullptr */
cleanup_encoder:
    encoder_free(encoder);
cleanup_ast:
    ast_node_free(result.node);
    return err;
}
int get_execution_mode(int argc, char *argv[]) {
if (argc != 3 || (strcmp(argv[1], "tokens") != 0 &&
strcmp(argv[1], "text") != 0 && strcmp(argv[1], "ast"))) {
puts("Usage: oas [tokens|text|ast] <filename>");
exit(1);
}
if (argc != 3)
return MODE_INVALID;
if (strcmp(argv[1], "tokens") == 0)
return MODE_TOKENS;
if (strcmp(argv[1], "text") == 0)
return MODE_TEXT;
if (strcmp(argv[1], "ast") == 0)
return MODE_AST;
if (strcmp(argv[1], "encoding") == 0)
return MODE_ENCODING;
return MODE_INVALID;
}
/**
 * Run the action selected by `mode` on the token list.
 *
 * Token and text printing cannot fail and yield nullptr; AST and encoding
 * printing forward the error returned by their implementation.
 * MODE_INVALID must never be passed in.
 */
error_t *do_action(mode_t mode, tokenlist_t *list) {
    error_t *err = nullptr;

    switch (mode) {
    case MODE_TOKENS:
        print_tokens(list);
        break;
    case MODE_TEXT:
        print_text(list);
        break;
    case MODE_AST:
        err = print_ast(list);
        break;
    case MODE_ENCODING:
        err = print_encoding(list);
        break;
    case MODE_INVALID:
        /* can't happen */
        __builtin_unreachable();
    }

    return err;
}
int main(int argc, char *argv[]) {
mode_t mode = get_execution_mode(argc, argv);
if (mode == MODE_INVALID) {
puts("Usage: oas [tokens|text|ast|encoding] <filename>");
exit(1);
}
char *filename = argv[2];
lexer_t *lex = &(lexer_t){};
@ -81,17 +155,9 @@ int main(int argc, char *argv[]) {
if (err)
goto cleanup_tokens;
switch (mode) {
case MODE_TOKENS:
print_tokens(list);
break;
case MODE_TEXT:
print_text(list);
break;
case MODE_AST:
print_ast(list);
break;
}
err = do_action(mode, list);
if (err)
goto cleanup_tokens;
tokenlist_free(list);
error_free(err);

View File

@ -83,7 +83,7 @@ parse_result_t parse_register_expression(tokenlist_entry_t *current) {
}
parse_result_t parse_immediate(tokenlist_entry_t *current) {
parser_t parsers[] = {parse_number, parse_identifier, nullptr};
parser_t parsers[] = {parse_number, parse_label_reference, nullptr};
parse_result_t result = parse_any(current, parsers);
return parse_result_wrap(NODE_IMMEDIATE, result);
}
@ -119,8 +119,24 @@ parse_result_t parse_section_directive(tokenlist_entry_t *current) {
return parse_consecutive(current, NODE_SECTION_DIRECTIVE, parsers);
}
/**
 * Parse `import <identifier>` into a NODE_IMPORT_DIRECTIVE: the import
 * keyword followed by an identifier, handed to parse_consecutive.
 */
parse_result_t parse_import_directive(tokenlist_entry_t *current) {
    parser_t sequence[] = {
        parse_import,
        parse_identifier,
        nullptr,
    };
    return parse_consecutive(current, NODE_IMPORT_DIRECTIVE, sequence);
}
/**
 * Parse `export <identifier>` into a NODE_EXPORT_DIRECTIVE: the export
 * keyword followed by an identifier, handed to parse_consecutive.
 */
parse_result_t parse_export_directive(tokenlist_entry_t *current) {
    parser_t sequence[] = {
        parse_export,
        parse_identifier,
        nullptr,
    };
    return parse_consecutive(current, NODE_EXPORT_DIRECTIVE, sequence);
}
/**
 * Parse the body of a directive. The section, import and export directive
 * parsers are passed, in that order, to parse_any.
 */
parse_result_t parse_directive_options(tokenlist_entry_t *current) {
    parser_t alternatives[] = {
        parse_section_directive,
        parse_import_directive,
        parse_export_directive,
        nullptr,
    };
    return parse_any(current, alternatives);
}
parse_result_t parse_directive(tokenlist_entry_t *current) {
parser_t parsers[] = {parse_dot, parse_section_directive, parse_newline,
parser_t parsers[] = {parse_dot, parse_directive_options, parse_newline,
nullptr};
return parse_consecutive(current, NODE_DIRECTIVE, parsers);
}

View File

@ -1,5 +1,6 @@
#include "primitives.h"
#include "../ast.h"
#include "../data/registers.h"
#include <string.h>
parse_result_t parse_identifier(tokenlist_entry_t *current) {
@ -71,23 +72,9 @@ parse_result_t parse_label_reference(tokenlist_entry_t *current) {
nullptr);
}
const char *registers[] = {
// 64-bit registers
"rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi", "r8", "r9", "r10",
"r11", "r12", "r13", "r14", "r15",
// 32-bit registers
"eax", "ecx", "edx", "ebx", "esp", "ebp", "esi", "edi", "r8d", "r9d",
"r10d", "r11d", "r12d", "r13d", "r14d", "r15d",
// 16-bit registers
"ax", "cx", "dx", "bx", "sp", "bp", "si", "di", "r8w", "r9w", "r10w",
"r11w", "r12w", "r13w", "r14w", "r15w",
// 8-bit low registers
"al", "cl", "dl", "bl", "spl", "bpl", "sil", "dil", "r8b", "r9b", "r10b",
"r11b", "r12b", "r13b", "r14b", "r15b", nullptr};
bool is_register_token(lexer_token_t *token) {
for (size_t i = 0; registers[i] != nullptr; ++i)
if (strcmp(token->value, registers[i]) == 0)
if (strcmp(token->value, registers[i]->name) == 0)
return true;
return false;
}
@ -105,3 +92,19 @@ parse_result_t parse_section(tokenlist_entry_t *current) {
return parse_token(current, TOKEN_IDENTIFIER, NODE_SECTION,
is_section_token);
}
/** True when the token's text is exactly "import". */
bool is_import_token(lexer_token_t *token) {
    const char *text = token->value;
    return !strcmp(text, "import");
}
/**
 * Parse the `import` keyword: an identifier token accepted by
 * is_import_token, producing a NODE_IMPORT.
 */
parse_result_t parse_import(tokenlist_entry_t *current) {
    parse_result_t result =
        parse_token(current, TOKEN_IDENTIFIER, NODE_IMPORT, is_import_token);
    return result;
}
/** True when the token's text is exactly "export". */
bool is_export_token(lexer_token_t *token) {
    const char *text = token->value;
    return !strcmp(text, "export");
}
/**
 * Parse the `export` keyword: an identifier token accepted by
 * is_export_token, producing a NODE_EXPORT.
 */
parse_result_t parse_export(tokenlist_entry_t *current) {
    parse_result_t result =
        parse_token(current, TOKEN_IDENTIFIER, NODE_EXPORT, is_export_token);
    return result;
}

View File

@ -27,5 +27,7 @@ parse_result_t parse_label_reference(tokenlist_entry_t *current);
*/
parse_result_t parse_register(tokenlist_entry_t *current);
parse_result_t parse_section(tokenlist_entry_t *current);
parse_result_t parse_import(tokenlist_entry_t *current);
parse_result_t parse_export(tokenlist_entry_t *current);
#endif // INCLUDE_PARSER_PRIMITIVES_H_

164
tests/bytes.c Normal file
View File

@ -0,0 +1,164 @@
#include "../src/bytes.h"
#include "munit.h"
/* A buffer created with LOCAL_BYTES(16) starts empty, reports a capacity of
 * 16 and has a zeroed backing store. */
MunitResult test_bytes_initializer(const MunitParameter params[], void *data) {
    (void)params;
    (void)data;

    bytes_t *buffer = LOCAL_BYTES(16);

    munit_assert_size(buffer->len, ==, 0);
    munit_assert_size(buffer->cap, ==, 16);

    size_t index = 0;
    while (index < 16) {
        munit_assert_uint8(buffer->buffer[index], ==, 0);
        ++index;
    }

    return MUNIT_OK;
}
/* Appending single bytes stores them in order until the buffer is full; one
 * more append then reports err_bytes_no_capacity. */
MunitResult test_bytes_append_uint8(const MunitParameter params[], void *data) {
    (void)params;
    (void)data;

    bytes_t *buffer = LOCAL_BYTES(16);
    munit_assert_size(buffer->len, ==, 0);
    munit_assert_size(buffer->cap, ==, 16);

    size_t written = 0;
    while (written < 16) {
        error_t *append_err = bytes_append_uint8(buffer, (uint8_t)written);
        munit_assert_null(append_err);
        munit_assert_uint8(buffer->buffer[written], ==, (uint8_t)written);
        ++written;
    }

    /* the 17th byte no longer fits */
    error_t *overflow_err = bytes_append_uint8(buffer, 0xFF);
    munit_assert_ptr(overflow_err, ==, err_bytes_no_capacity);

    return MUNIT_OK;
}
/* Appending arrays copies them back to back; an array that does not fit is
 * rejected with err_bytes_no_capacity and leaves the length unchanged. */
MunitResult test_bytes_append_array(const MunitParameter params[], void *data) {
    (void)params;
    (void)data;

    bytes_t *buffer = LOCAL_BYTES(16);
    munit_assert_size(buffer->len, ==, 0);
    munit_assert_size(buffer->cap, ==, 16);

    /* first chunk: 5 bytes */
    uint8_t first[] = {0x01, 0x02, 0x03, 0x04, 0x05};
    size_t first_len = sizeof(first) / sizeof(first[0]);

    error_t *err = bytes_append_array(buffer, first_len, first);
    munit_assert_null(err);
    munit_assert_size(buffer->len, ==, first_len);
    for (size_t pos = 0; pos < first_len; ++pos) {
        munit_assert_uint8(buffer->buffer[pos], ==, first[pos]);
    }

    /* second chunk: 3 bytes, lands directly behind the first */
    uint8_t second[] = {0x06, 0x07, 0x08};
    size_t second_len = sizeof(second) / sizeof(second[0]);

    err = bytes_append_array(buffer, second_len, second);
    munit_assert_null(err);
    munit_assert_size(buffer->len, ==, first_len + second_len);
    for (size_t pos = 0; pos < second_len; ++pos) {
        munit_assert_uint8(buffer->buffer[first_len + pos], ==, second[pos]);
    }

    /* 8 bytes used, 10 more would exceed the capacity of 16 */
    uint8_t too_big[10] = {0};
    err = bytes_append_array(buffer, sizeof(too_big), too_big);
    munit_assert_ptr(err, ==, err_bytes_no_capacity);
    munit_assert_size(buffer->len, ==, first_len + second_len);

    return MUNIT_OK;
}
MunitResult test_bytes_append_bytes(const MunitParameter params[], void *data) {
(void)params;
(void)data;
bytes_t *src = LOCAL_BYTES(8);
bytes_t *dst = LOCAL_BYTES(16);
// Fill source bytes with test data
for (uint8_t i = 0; i < 5; ++i) {
error_t *err = bytes_append_uint8(src, i + 1);
munit_assert_null(err);
}
munit_assert_size(src->len, ==, 5);
// Append source to destination
error_t *err = bytes_append_bytes(dst, src);
munit_assert_null(err);
munit_assert_size(dst->len, ==, src->len);
// Verify destination contents match source
for (size_t i = 0; i < src->len; ++i) {
munit_assert_uint8(dst->buffer[i], ==, src->buffer[i]);
}
// Fill source with more data and append again
for (uint8_t i = 0; i < 3; ++i) {
err = bytes_append_uint8(src, i + 6);
munit_assert_null(err);
}
munit_assert_size(src->len, ==, 8);
// Append updated source
err = bytes_append_bytes(dst, src);
munit_assert_null(err);
munit_assert_size(dst->len, ==, 13); // 5 + 8
// Test capacity boundary
src->len = 4; // manually set length to barely not fit
err = bytes_append_bytes(dst, src);
munit_assert_ptr(err, ==, err_bytes_no_capacity);
munit_assert_size(dst->len, ==, 13); // Length unchanged after error
return MUNIT_OK;
}
/* A 16-bit value is appended as two bytes, least significant byte first. */
MunitResult test_bytes_append_uint16(const MunitParameter params[], void *data) {
    /* unused munit arguments, silenced like in the other tests of this file */
    (void)params;
    (void)data;

    bytes_t *bytes = LOCAL_BYTES(16);
    munit_assert_size(bytes->len, ==, 0);
    munit_assert_size(bytes->cap, ==, 16);

    bytes_append_uint16(bytes, 0xFFAA);
    munit_assert_size(bytes->len, ==, 2);
    munit_assert_uint8(bytes->buffer[0], ==, 0xAA);
    munit_assert_uint8(bytes->buffer[1], ==, 0xFF);

    return MUNIT_OK;
}
/* A 32-bit value is appended as four bytes, least significant byte first. */
MunitResult test_bytes_append_uint32(const MunitParameter params[], void *data) {
    /* unused munit arguments, silenced like in the other tests of this file */
    (void)params;
    (void)data;

    bytes_t *bytes = LOCAL_BYTES(16);
    munit_assert_size(bytes->len, ==, 0);
    munit_assert_size(bytes->cap, ==, 16);

    bytes_append_uint32(bytes, 0xAABBCCDD);
    munit_assert_size(bytes->len, ==, 4);
    munit_assert_uint8(bytes->buffer[0], ==, 0xDD);
    munit_assert_uint8(bytes->buffer[1], ==, 0xCC);
    munit_assert_uint8(bytes->buffer[2], ==, 0xBB);
    munit_assert_uint8(bytes->buffer[3], ==, 0xAA);

    return MUNIT_OK;
}
/* A 64-bit value is appended as eight bytes, least significant byte first. */
MunitResult test_bytes_append_uint64(const MunitParameter params[], void *data) {
    /* unused munit arguments, silenced like in the other tests of this file */
    (void)params;
    (void)data;

    bytes_t *bytes = LOCAL_BYTES(16);
    munit_assert_size(bytes->len, ==, 0);
    munit_assert_size(bytes->cap, ==, 16);

    bytes_append_uint64(bytes, 0xAABBCCDDEEFF9988);
    munit_assert_size(bytes->len, ==, 8);
    munit_assert_uint8(bytes->buffer[0], ==, 0x88);
    munit_assert_uint8(bytes->buffer[1], ==, 0x99);
    munit_assert_uint8(bytes->buffer[2], ==, 0xFF);
    munit_assert_uint8(bytes->buffer[3], ==, 0xEE);
    munit_assert_uint8(bytes->buffer[4], ==, 0xDD);
    munit_assert_uint8(bytes->buffer[5], ==, 0xCC);
    munit_assert_uint8(bytes->buffer[6], ==, 0xBB);
    munit_assert_uint8(bytes->buffer[7], ==, 0xAA);

    return MUNIT_OK;
}
/* Test table for the byte buffer tests; referenced from the test runner's
 * suite list. The entry whose name and function are both nullptr terminates
 * the list. */
MunitTest bytes_tests[] = {
{"/initializer", test_bytes_initializer, nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr},
{"/append_uint8", test_bytes_append_uint8, nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr},
{"/append_array", test_bytes_append_array, nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr},
{"/append_bytes", test_bytes_append_bytes, nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr},
{"/append_uint16", test_bytes_append_uint16, nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr},
{"/append_uint32", test_bytes_append_uint32, nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr},
{"/append_uint64", test_bytes_append_uint64, nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr},
/* terminator */
{nullptr, nullptr, nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr}
};

View File

@ -0,0 +1,65 @@
lbl_0: ; 65 symbols used for testing growing the symbols table
lbl_1:
lbl_2:
lbl_3:
lbl_4:
lbl_5:
lbl_6:
lbl_7:
lbl_8:
lbl_9:
lbl_10:
lbl_11:
lbl_12:
lbl_13:
lbl_14:
lbl_15:
lbl_16:
lbl_17:
lbl_18:
lbl_19:
lbl_20:
lbl_21:
lbl_22:
lbl_23:
lbl_24:
lbl_25:
lbl_26:
lbl_27:
lbl_28:
lbl_29:
lbl_30:
lbl_31:
lbl_32:
lbl_33:
lbl_34:
lbl_35:
lbl_36:
lbl_37:
lbl_38:
lbl_39:
lbl_40:
lbl_41:
lbl_42:
lbl_43:
lbl_44:
lbl_45:
lbl_46:
lbl_47:
lbl_48:
lbl_49:
lbl_50:
lbl_51:
lbl_52:
lbl_53:
lbl_54:
lbl_55:
lbl_56:
lbl_57:
lbl_58:
lbl_59:
lbl_60:
lbl_61:
lbl_62:
lbl_63:
lbl_64:

12
tests/input/symbols.asm Normal file
View File

@ -0,0 +1,12 @@
.import test
.export test
test:
call test
.import more
.export more
more:
call more
.import other
.export other
other:
call other

View File

@ -2,6 +2,9 @@
; Small valid code snippet that should contain all different AST nodes
.export _start
.import exit
_start:
mov eax, ebx
lea eax, [eax + ebx * 4 + 8]
@ -19,3 +22,5 @@ _start:
push 0xffff:64
push 0o777:16
push 0b0001:16
mov rax, 0
call exit

View File

@ -3,12 +3,16 @@
extern MunitTest ast_tests[];
extern MunitTest lexer_tests[];
extern MunitTest regression_tests[];
extern MunitTest symbols_tests[];
extern MunitTest bytes_tests[];
int main(int argc, char *argv[MUNIT_ARRAY_PARAM(argc + 1)]) {
MunitSuite suites[] = {
{"/regression", regression_tests, nullptr, 1, MUNIT_SUITE_OPTION_NONE},
{"/ast", ast_tests, nullptr, 1, MUNIT_SUITE_OPTION_NONE},
{"/lexer", lexer_tests, nullptr, 1, MUNIT_SUITE_OPTION_NONE},
{"/symbols", symbols_tests, nullptr, 1, MUNIT_SUITE_OPTION_NONE},
{"/bytes", bytes_tests, nullptr, 1, MUNIT_SUITE_OPTION_NONE},
{nullptr, nullptr, nullptr, 0, MUNIT_SUITE_OPTION_NONE},
};

351
tests/symbols.c Normal file
View File

@ -0,0 +1,351 @@
#include "../src/encoder/symbols.h"
#include "../src/ast.h"
#include "../src/error.h"
#include "../src/lexer.h"
#include "../src/parser/parser.h"
#include "munit.h"
#include <string.h>
/**
 * Shared fixture: lex and parse the file at `path`.
 *
 * On return *list holds the token list and *node the root of the parsed AST;
 * the caller frees both (ast_node_free / tokenlist_free). The test fails
 * right here when parsing reports an error or yields no root node, instead
 * of crashing later on a null root.
 *
 * NOTE(review): the results of lexer_open, tokenlist_alloc and tokenlist_fill
 * are still not checked here - confirm their return types and assert on them
 * as well.
 */
void symbols_setup_test(ast_node_t **node, tokenlist_t **list, char *path) {
    lexer_t *lex = &(lexer_t){};
    lexer_open(lex, path);
    tokenlist_alloc(list);
    tokenlist_fill(*list, lex);
    parse_result_t result = parse((*list)->head);
    lexer_close(lex);

    /* every caller dereferences the root, so a failed parse must stop here */
    munit_assert_null(result.err);
    munit_assert_not_null(result.node);

    *node = result.node;
}
/* A freshly allocated table is empty, has the default capacity of 64 and a
 * backing array. */
MunitResult test_symbol_table_alloc(const MunitParameter params[], void *data) {
    (void)params;
    (void)data;

    symbol_table_t *fresh = nullptr;
    error_t *alloc_err = symbol_table_alloc(&fresh);

    munit_assert_ptr_not_null(fresh);
    munit_assert_ptr_null(alloc_err);

    munit_assert_size(fresh->cap, ==, 64); // Default capacity
    munit_assert_size(fresh->len, ==, 0);
    munit_assert_ptr_not_null(fresh->symbols);

    symbol_table_free(fresh);
    return MUNIT_OK;
}
/* Looking up any name in an empty table yields nullptr. */
MunitResult test_symbol_table_lookup_empty(const MunitParameter params[], void *data) {
    (void)params;
    (void)data;

    symbol_table_t *empty = nullptr;
    symbol_table_alloc(&empty);

    symbol_t *found = symbol_table_lookup(empty, "nonexistent");
    munit_assert_ptr_null(found);

    symbol_table_free(empty);
    return MUNIT_OK;
}
/* Adding a label reference to an empty table creates a SYMBOL_REFERENCE
 * entry that points back at the reference node. */
MunitResult test_symbol_add_reference(const MunitParameter params[], void *data) {
    (void)params;
    (void)data;

    ast_node_t *root;
    tokenlist_t *list;
    symbols_setup_test(&root, &list, "tests/input/symbols.asm");

    symbol_table_t *table = nullptr;
    symbol_table_alloc(&table);

    /* label reference nested inside the fourth top-level node */
    ast_node_t *ref_node = root->children[3]->children[1]->children[0]->children[0];
    munit_assert_int(ref_node->id, ==, NODE_LABEL_REFERENCE);
    munit_assert_size(table->len, ==, 0);

    error_t *err = symbol_table_update(table, ref_node);
    munit_assert_null(err);
    munit_assert_size(table->len, ==, 1);

    symbol_t *entry = symbol_table_lookup(table, "test");
    munit_assert_not_null(entry);
    munit_assert_int(SYMBOL_REFERENCE, ==, entry->kind);
    munit_assert_ptr_equal(ref_node, entry->node);
    munit_assert_string_equal(entry->name, "test");

    symbol_table_free(table);
    ast_node_free(root);
    tokenlist_free(list);
    return MUNIT_OK;
}
/* Adding a label to an empty table creates a SYMBOL_LOCAL entry that points
 * back at the label node. */
MunitResult test_symbol_add_label(const MunitParameter params[], void *data) {
    (void)params;
    (void)data;

    ast_node_t *root;
    tokenlist_t *list;
    symbols_setup_test(&root, &list, "tests/input/symbols.asm");

    symbol_table_t *table = nullptr;
    symbol_table_alloc(&table);

    /* third top-level node is the `test:` label */
    ast_node_t *label_node = root->children[2];
    munit_assert_int(label_node->id, ==, NODE_LABEL);
    munit_assert_size(table->len, ==, 0);

    error_t *err = symbol_table_update(table, label_node);
    munit_assert_null(err);
    munit_assert_size(table->len, ==, 1);

    symbol_t *entry = symbol_table_lookup(table, "test");
    munit_assert_not_null(entry);
    munit_assert_int(SYMBOL_LOCAL, ==, entry->kind);
    munit_assert_ptr_equal(label_node, entry->node);
    munit_assert_string_equal(entry->name, "test");

    symbol_table_free(table);
    ast_node_free(root);
    tokenlist_free(list);
    return MUNIT_OK;
}
/* Adding an import directive to an empty table creates a SYMBOL_IMPORT entry
 * that points back at the directive node. */
MunitResult test_symbol_add_import(const MunitParameter params[], void *data) {
    (void)params;
    (void)data;

    ast_node_t *root;
    tokenlist_t *list;
    symbols_setup_test(&root, &list, "tests/input/symbols.asm");

    symbol_table_t *table = nullptr;
    symbol_table_alloc(&table);

    /* second child of the first top-level node is the import directive */
    ast_node_t *import_node = root->children[0]->children[1];
    munit_assert_int(import_node->id, ==, NODE_IMPORT_DIRECTIVE);
    munit_assert_size(table->len, ==, 0);

    error_t *err = symbol_table_update(table, import_node);
    munit_assert_null(err);
    munit_assert_size(table->len, ==, 1);

    symbol_t *entry = symbol_table_lookup(table, "test");
    munit_assert_not_null(entry);
    munit_assert_int(SYMBOL_IMPORT, ==, entry->kind);
    munit_assert_ptr_equal(import_node, entry->node);
    munit_assert_string_equal(entry->name, "test");

    symbol_table_free(table);
    ast_node_free(root);
    tokenlist_free(list);
    return MUNIT_OK;
}
/**
 * Helper: insert `first` into a fresh table, then apply `second` and check
 * the outcome.
 *
 * @param name           symbol name both nodes refer to
 * @param first          node inserted first
 * @param first_kind     kind expected after inserting `first`
 * @param second         node applied on top
 * @param second_kind    kind expected when the entry gets replaced
 * @param should_succeed false when the second update must report
 *                       err_symbol_table_incompatible_symbols
 * @param should_update  true when the entry must afterwards describe `second`,
 *                       false when it must still describe `first`
 */
void test_symbol_update(const char *name, ast_node_t *first, symbol_kind_t first_kind, ast_node_t *second,
                        symbol_kind_t second_kind, bool should_succeed, bool should_update) {
    symbol_table_t *table = nullptr;
    symbol_table_alloc(&table);
    munit_assert_size(table->len, ==, 0);

    /* step 1: the first node always goes in cleanly */
    error_t *err = symbol_table_update(table, first);
    munit_assert_null(err);
    munit_assert_size(table->len, ==, 1);

    symbol_t *entry = symbol_table_lookup(table, name);
    munit_assert_not_null(entry);
    munit_assert_int(first_kind, ==, entry->kind);
    munit_assert_ptr_equal(first, entry->node);
    munit_assert_string_equal(entry->name, name);

    /* step 2: apply the second node */
    err = symbol_table_update(table, second);
    if (!should_succeed) {
        munit_assert_ptr_equal(err, err_symbol_table_incompatible_symbols);
    } else {
        munit_assert_null(err);
    }
    /* never a second entry for the same name */
    munit_assert_size(table->len, ==, 1);

    /* step 3: the entry describes either the second or still the first node */
    symbol_kind_t expected_kind = should_update ? second_kind : first_kind;
    ast_node_t *expected_node = should_update ? second : first;

    entry = symbol_table_lookup(table, name);
    munit_assert_not_null(entry);
    munit_assert_int(expected_kind, ==, entry->kind);
    munit_assert_ptr_equal(expected_node, entry->node);
    munit_assert_string_equal(entry->name, name);

    symbol_table_free(table);
}
/* Every kind transition that must succeed: real upgrades replace the entry,
 * identity updates and permitted downgrades leave it untouched. */
MunitResult test_symbol_upgrade_valid(const MunitParameter params[], void *data) {
    /* unused munit arguments, silenced like in the other tests of this file */
    (void)params;
    (void)data;

    ast_node_t *root;
    tokenlist_t *list;
    symbols_setup_test(&root, &list, "tests/input/symbols.asm");

    ast_node_t *reference = root->children[3]->children[1]->children[0]->children[0];
    ast_node_t *label = root->children[2];
    ast_node_t *import_directive = root->children[0]->children[1];
    ast_node_t *export_directive = root->children[1]->children[1];

    // real upgrades
    test_symbol_update("test", reference, SYMBOL_REFERENCE, label, SYMBOL_LOCAL, true, true);
    test_symbol_update("test", reference, SYMBOL_REFERENCE, import_directive, SYMBOL_IMPORT, true, true);
    test_symbol_update("test", reference, SYMBOL_REFERENCE, export_directive, SYMBOL_EXPORT, true, true);
    test_symbol_update("test", label, SYMBOL_LOCAL, export_directive, SYMBOL_EXPORT, true, true);

    // identity upgrades
    test_symbol_update("test", reference, SYMBOL_REFERENCE, reference, SYMBOL_REFERENCE, true, false);
    test_symbol_update("test", label, SYMBOL_LOCAL, label, SYMBOL_LOCAL, true, false);
    test_symbol_update("test", import_directive, SYMBOL_IMPORT, import_directive, SYMBOL_IMPORT, true, false);
    test_symbol_update("test", export_directive, SYMBOL_EXPORT, export_directive, SYMBOL_EXPORT, true, false);

    // downgrades that are allowed and ignored
    test_symbol_update("test", label, SYMBOL_LOCAL, reference, SYMBOL_REFERENCE, true, false);
    test_symbol_update("test", import_directive, SYMBOL_IMPORT, reference, SYMBOL_REFERENCE, true, false);
    test_symbol_update("test", export_directive, SYMBOL_EXPORT, reference, SYMBOL_REFERENCE, true, false);
    test_symbol_update("test", export_directive, SYMBOL_EXPORT, label, SYMBOL_LOCAL, true, false);
    test_symbol_update("test", import_directive, SYMBOL_IMPORT, label, SYMBOL_LOCAL, true, false);

    ast_node_free(root);
    tokenlist_free(list);
    return MUNIT_OK;
}
/* Every kind transition that must be rejected with
 * err_symbol_table_incompatible_symbols while leaving the entry untouched. */
MunitResult test_symbol_upgrade_invalid(const MunitParameter params[], void *data) {
    /* unused munit arguments, silenced like in the other tests of this file */
    (void)params;
    (void)data;

    ast_node_t *root;
    tokenlist_t *list;
    symbols_setup_test(&root, &list, "tests/input/symbols.asm");

    /* `reference` from the original test was never used here and is dropped */
    ast_node_t *label = root->children[2];
    ast_node_t *import_directive = root->children[0]->children[1];
    ast_node_t *export_directive = root->children[1]->children[1];

    // invalid upgrades
    test_symbol_update("test", label, SYMBOL_LOCAL, import_directive, SYMBOL_IMPORT, false, false);
    test_symbol_update("test", export_directive, SYMBOL_EXPORT, import_directive, SYMBOL_IMPORT, false, false);
    test_symbol_update("test", import_directive, SYMBOL_IMPORT, export_directive, SYMBOL_EXPORT, false, false);

    ast_node_free(root);
    tokenlist_free(list);
    return MUNIT_OK;
}
/* Adding an export directive to an empty table creates a SYMBOL_EXPORT entry
 * that points back at the directive node. */
MunitResult test_symbol_add_export(const MunitParameter params[], void *data) {
    (void)params;
    (void)data;

    ast_node_t *root;
    tokenlist_t *list;
    symbols_setup_test(&root, &list, "tests/input/symbols.asm");

    symbol_table_t *table = nullptr;
    symbol_table_alloc(&table);

    /* second child of the second top-level node is the export directive */
    ast_node_t *export_node = root->children[1]->children[1];
    munit_assert_int(export_node->id, ==, NODE_EXPORT_DIRECTIVE);
    munit_assert_size(table->len, ==, 0);

    error_t *err = symbol_table_update(table, export_node);
    munit_assert_null(err);
    munit_assert_size(table->len, ==, 1);

    symbol_t *entry = symbol_table_lookup(table, "test");
    munit_assert_not_null(entry);
    munit_assert_int(SYMBOL_EXPORT, ==, entry->kind);
    munit_assert_ptr_equal(export_node, entry->node);
    munit_assert_string_equal(entry->name, "test");

    symbol_table_free(table);
    ast_node_free(root);
    tokenlist_free(list);
    return MUNIT_OK;
}
MunitResult test_symbol_table_growth(const MunitParameter params[], void *data) {
(void)params;
(void)data;
ast_node_t *root;
tokenlist_t *list;
symbol_table_t *table = nullptr;
// Set up with our manysymbols.asm file
symbols_setup_test(&root, &list, "tests/input/manysymbols.asm");
symbol_table_alloc(&table);
// Initial capacity should be the default (64)
munit_assert_size(table->cap, ==, 64);
munit_assert_size(table->len, ==, 0);
// Add the first 64 labels (indices 0-63)
size_t initial_cap = table->cap;
for (size_t i = 0; i < 64; i++) {
ast_node_t *label = root->children[i];
munit_assert_int(label->id, ==, NODE_LABEL);
error_t *err = symbol_table_update(table, label);
munit_assert_null(err);
munit_assert_size(table->len, ==, i + 1);
// Capacity should remain the same for the first 64 labels
munit_assert_size(table->cap, ==, initial_cap);
}
// Now add the 65th label (index 64), which should trigger growth
ast_node_t *final_label = root->children[64];
munit_assert_int(final_label->id, ==, NODE_LABEL);
error_t *err = symbol_table_update(table, final_label);
munit_assert_null(err);
munit_assert_size(table->len, ==, 65);
// Capacity should have doubled
munit_assert_size(table->cap, ==, initial_cap * 2);
// Validate we can look up all the symbols
for (size_t i = 0; i <= 64; i++) {
char name[10];
sprintf(name, "lbl_%zu", i);
symbol_t *symbol = symbol_table_lookup(table, name);
munit_assert_not_null(symbol);
munit_assert_int(SYMBOL_LOCAL, ==, symbol->kind);
munit_assert_string_equal(symbol->name, name);
}
symbol_table_free(table);
ast_node_free(root);
tokenlist_free(list);
return MUNIT_OK;
}
/* A node that does not describe a symbol (here: the AST root) is rejected
 * with err_symbol_table_invalid_node and nothing is added. */
MunitResult test_symbol_invalid_node(const MunitParameter params[], void *data) {
    (void)params;
    (void)data;

    ast_node_t *root;
    tokenlist_t *list;
    symbols_setup_test(&root, &list, "tests/input/symbols.asm");

    symbol_table_t *table = nullptr;
    symbol_table_alloc(&table);
    munit_assert_size(table->len, ==, 0);

    error_t *update_err = symbol_table_update(table, root);
    munit_assert_ptr_equal(update_err, err_symbol_table_invalid_node);
    munit_assert_size(table->len, ==, 0);

    symbol_table_free(table);
    ast_node_free(root);
    tokenlist_free(list);
    return MUNIT_OK;
}
/* Test table for the symbol table tests; referenced from the test runner's
 * suite list. The entry whose name and function are both nullptr terminates
 * the list. */
MunitTest symbols_tests[] = {
{"/table_alloc", test_symbol_table_alloc, nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr},
{"/table_lookup_empty", test_symbol_table_lookup_empty, nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr},
{"/add_reference", test_symbol_add_reference, nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr},
{"/add_label", test_symbol_add_label, nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr},
{"/add_import", test_symbol_add_import, nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr},
{"/add_export", test_symbol_add_export, nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr},
{"/upgrade_valid", test_symbol_upgrade_valid, nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr},
{"/upgrade_invalid", test_symbol_upgrade_invalid, nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr},
{"/table_growth", test_symbol_table_growth, nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr},
{"/invalid_node", test_symbol_invalid_node, nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr},
/* terminator */
{nullptr, nullptr, nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr}
};