Compare commits

...

4 Commits

Author SHA1 Message Date
7b2cee0533 Add first encoding pass
All checks were successful
Validate the build / validate-build (push) Successful in 42s
First pass collects all the symbols and interprets number and register
tokens into usable data for the later passes.
2025-04-15 00:05:14 +02:00
3a164de8d4 Add register and number values to AST nodes 2025-04-15 00:05:14 +02:00
32ca7b942c Add initial limited opcode data 2025-04-15 00:05:14 +02:00
43ea0042b6 Add registers data table
Change the validated primitive parse_register so that it uses the data
table instead
2025-04-15 00:05:06 +02:00
8 changed files with 542 additions and 20 deletions

View File

@ -1,6 +1,7 @@
#ifndef INCLUDE_SRC_AST_H_ #ifndef INCLUDE_SRC_AST_H_
#define INCLUDE_SRC_AST_H_ #define INCLUDE_SRC_AST_H_
#include "data/registers.h"
#include "error.h" #include "error.h"
#include "lexer.h" #include "lexer.h"
#include "tokenlist.h" #include "tokenlist.h"
@ -62,6 +63,16 @@ constexpr size_t node_default_children_cap = 8;
/* 65K ought to be enough for anybody */ /* 65K ought to be enough for anybody */
constexpr size_t node_max_children_cap = 1 << 16; constexpr size_t node_max_children_cap = 1 << 16;
typedef struct number {
uint64_t value;
operand_size_t size;
} number_t;
typedef struct register_ {
register_id_t id;
operand_size_t size;
} register_t;
struct ast_node { struct ast_node {
node_id_t id; node_id_t id;
tokenlist_entry_t *token_entry; tokenlist_entry_t *token_entry;
@ -70,11 +81,8 @@ struct ast_node {
ast_node_t **children; ast_node_t **children;
union { union {
struct { register_t reg;
uint64_t value; number_t number;
int size;
} integer;
char *name;
} value; } value;
}; };

68
src/data/opcodes.c Normal file
View File

@ -0,0 +1,68 @@
#include "opcodes.h"
// clang-format off
// Table of the instruction encodings known to the assembler. Every element
// points at an opcode_data_t compound literal and the list is terminated by
// nullptr, so it can be walked without a separate length. Several entries
// share a mnemonic and differ only in operand kind/size and prefix flags.
// Fields that are not named in an entry are zero-initialized (for example
// encoding_class is then ENCODING_DEFAULT).
opcode_data_t *const opcodes[] = {
// RET
&(opcode_data_t) {
.mnemonic = "ret",
.opcode = 0xC3,
.opcode_extension = opcode_extension_none,
.operand_count = 0,
},
// RET imm16
&(opcode_data_t) {
.mnemonic = "ret",
.opcode = 0xC2,
.opcode_extension = opcode_extension_none,
.operand_count = 1,
.operands = {
{ .kind = OPERAND_IMMEDIATE, .size = OPERAND_SIZE_16 },
},
},
// PUSH imm8
&(opcode_data_t) {
.mnemonic = "push",
.opcode = 0x6A,
.opcode_extension = opcode_extension_none,
.operand_count = 1,
.operands = {
{ .kind = OPERAND_IMMEDIATE, .size = OPERAND_SIZE_8},
},
},
// PUSH imm16 (same opcode byte as PUSH imm32, selected by the operand size
// prefix flag)
&(opcode_data_t) {
.mnemonic = "push",
.opcode = 0x68,
.opcode_extension = opcode_extension_none,
.operand_size_prefix = true,
.operand_count = 1,
.operands = {
{ .kind = OPERAND_IMMEDIATE, .size = OPERAND_SIZE_16},
},
},
// PUSH imm32
&(opcode_data_t) {
.mnemonic = "push",
.opcode = 0x68,
.opcode_extension = opcode_extension_none,
.operand_size_prefix = false,
.operand_count = 1,
.operands = {
{ .kind = OPERAND_IMMEDIATE, .size = OPERAND_SIZE_32},
},
},
// PUSH reg16 / reg32 / reg64 (register is encoded in the opcode byte itself)
&(opcode_data_t) {
.mnemonic = "push",
.opcode = 0x50,
.opcode_extension = opcode_extension_none,
.encoding_class = ENCODING_OPCODE_REGISTER,
.operand_count = 1,
.operands = {
{ .kind = OPERAND_REGISTER, .size = OPERAND_SIZE_16 | OPERAND_SIZE_32 | OPERAND_SIZE_64 },
},
},
nullptr,
};

56
src/data/opcodes.h Normal file
View File

@ -0,0 +1,56 @@
#ifndef INCLUDE_DATA_OPCODES_H_
#define INCLUDE_DATA_OPCODES_H_
#include "../data/registers.h"
#include <stddef.h>
#include <stdint.h>
// REX prefix bytes. rex_prefix is the base value; each of the _w/_r/_x/_b
// variants is the base with exactly one of the low four bits set
// (0x08, 0x04, 0x02 and 0x01 respectively).
constexpr uint8_t rex_prefix = 0x40;
constexpr uint8_t rex_prefix_w = 0x48;
constexpr uint8_t rex_prefix_r = 0x44;
constexpr uint8_t rex_prefix_x = 0x42;
constexpr uint8_t rex_prefix_b = 0x41;
// Other single-byte instruction prefixes.
constexpr uint8_t operand_size_prefix = 0x66;
constexpr uint8_t memory_size_prefix = 0x67;
constexpr uint8_t lock_prefix = 0xF0;
constexpr uint8_t repne_prefix = 0xF2;
constexpr uint8_t rep_prefix = 0xF3;
// How the operands of an opcode table entry are placed into the instruction
// bytes. ENCODING_DEFAULT is the first enumerator (value 0), so table entries
// that do not set .encoding_class get it implicitly.
typedef enum encoding_class {
ENCODING_DEFAULT, // use modrm+sib for registers and memory, append
// immediates
ENCODING_OPCODE_REGISTER, // encode the register in the last 3 bits of the
// opcode
} encoding_class_t;
// The category of a single operand accepted by an opcode table entry.
typedef enum operand_kind {
OPERAND_REGISTER,
OPERAND_MEMORY,
OPERAND_IMMEDIATE,
} operand_kind_t;
// Description of one operand slot of an opcode table entry.
typedef struct operand_info {
operand_kind_t kind; // register, memory or immediate
operand_size_t size; // one OPERAND_SIZE_* flag, or several OR'd together
// when the entry accepts more than one size
} operand_info_t;
// Sentinel stored in opcode_data.opcode_extension when an entry has no opcode
// extension. 0xFF cannot be confused with a real 3-bit extension value.
constexpr uint8_t opcode_extension_none = 0xFF;
// One row of the opcode table: a mnemonic together with the opcode value,
// the prefixes it requires and the operands it accepts.
typedef struct opcode_data {
const char *mnemonic;
uint16_t opcode;
uint8_t opcode_extension; // 3 bits for the opcode extension in the reg
// field of a modr/m byte
encoding_class_t encoding_class;
bool operand_size_prefix;
bool address_size_prefix;
bool rex_w_prefix;
size_t operand_count; // number of leading entries of operands[] in use
operand_info_t operands[3];
} opcode_data_t;
// nullptr-terminated list of all known opcode entries (defined in opcodes.c).
extern opcode_data_t *const opcodes[];
#endif // INCLUDE_DATA_OPCODES_H_

92
src/data/registers.c Normal file
View File

@ -0,0 +1,92 @@
#include "registers.h"
// Table of every register name the assembler recognises, together with the
// register id and operand size that name maps to. Entries are initialized
// positionally as {name, id, size}. Different names may share one id and
// differ only in size (e.g. "rax", "eax", "ax" and "al" are all REG_A).
// The list is terminated by nullptr so it can be walked without a length.
register_data_t *const registers[] = {
// Instruction pointer
&(register_data_t){"rip", REG_RIP, OPERAND_SIZE_64},
&(register_data_t){"eip", REG_RIP, OPERAND_SIZE_32},
&(register_data_t){"ip", REG_RIP, OPERAND_SIZE_16},
// 64-bit general purpose registers
&(register_data_t){"rax", REG_A, OPERAND_SIZE_64},
&(register_data_t){"rcx", REG_C, OPERAND_SIZE_64},
&(register_data_t){"rdx", REG_D, OPERAND_SIZE_64},
&(register_data_t){"rbx", REG_B, OPERAND_SIZE_64},
&(register_data_t){"rsp", REG_SP, OPERAND_SIZE_64},
&(register_data_t){"rbp", REG_BP, OPERAND_SIZE_64},
&(register_data_t){"rsi", REG_SI, OPERAND_SIZE_64},
&(register_data_t){"rdi", REG_DI, OPERAND_SIZE_64},
&(register_data_t){"r8", REG_8, OPERAND_SIZE_64},
&(register_data_t){"r9", REG_9, OPERAND_SIZE_64},
&(register_data_t){"r10", REG_10, OPERAND_SIZE_64},
&(register_data_t){"r11", REG_11, OPERAND_SIZE_64},
&(register_data_t){"r12", REG_12, OPERAND_SIZE_64},
&(register_data_t){"r13", REG_13, OPERAND_SIZE_64},
&(register_data_t){"r14", REG_14, OPERAND_SIZE_64},
&(register_data_t){"r15", REG_15, OPERAND_SIZE_64},
// 32-bit general purpose registers
&(register_data_t){"eax", REG_A, OPERAND_SIZE_32},
&(register_data_t){"ecx", REG_C, OPERAND_SIZE_32},
&(register_data_t){"edx", REG_D, OPERAND_SIZE_32},
&(register_data_t){"ebx", REG_B, OPERAND_SIZE_32},
&(register_data_t){"esp", REG_SP, OPERAND_SIZE_32},
&(register_data_t){"ebp", REG_BP, OPERAND_SIZE_32},
&(register_data_t){"esi", REG_SI, OPERAND_SIZE_32},
&(register_data_t){"edi", REG_DI, OPERAND_SIZE_32},
&(register_data_t){"r8d", REG_8, OPERAND_SIZE_32},
&(register_data_t){"r9d", REG_9, OPERAND_SIZE_32},
&(register_data_t){"r10d", REG_10, OPERAND_SIZE_32},
&(register_data_t){"r11d", REG_11, OPERAND_SIZE_32},
&(register_data_t){"r12d", REG_12, OPERAND_SIZE_32},
&(register_data_t){"r13d", REG_13, OPERAND_SIZE_32},
&(register_data_t){"r14d", REG_14, OPERAND_SIZE_32},
&(register_data_t){"r15d", REG_15, OPERAND_SIZE_32},
// 16-bit general purpose registers
&(register_data_t){"ax", REG_A, OPERAND_SIZE_16},
&(register_data_t){"cx", REG_C, OPERAND_SIZE_16},
&(register_data_t){"dx", REG_D, OPERAND_SIZE_16},
&(register_data_t){"bx", REG_B, OPERAND_SIZE_16},
&(register_data_t){"sp", REG_SP, OPERAND_SIZE_16},
&(register_data_t){"bp", REG_BP, OPERAND_SIZE_16},
&(register_data_t){"si", REG_SI, OPERAND_SIZE_16},
&(register_data_t){"di", REG_DI, OPERAND_SIZE_16},
&(register_data_t){"r8w", REG_8, OPERAND_SIZE_16},
&(register_data_t){"r9w", REG_9, OPERAND_SIZE_16},
&(register_data_t){"r10w", REG_10, OPERAND_SIZE_16},
&(register_data_t){"r11w", REG_11, OPERAND_SIZE_16},
&(register_data_t){"r12w", REG_12, OPERAND_SIZE_16},
&(register_data_t){"r13w", REG_13, OPERAND_SIZE_16},
&(register_data_t){"r14w", REG_14, OPERAND_SIZE_16},
&(register_data_t){"r15w", REG_15, OPERAND_SIZE_16},
// 8-bit general purpose registers (low byte)
&(register_data_t){"al", REG_A, OPERAND_SIZE_8 },
&(register_data_t){"cl", REG_C, OPERAND_SIZE_8 },
&(register_data_t){"dl", REG_D, OPERAND_SIZE_8 },
&(register_data_t){"bl", REG_B, OPERAND_SIZE_8 },
&(register_data_t){"spl", REG_SP, OPERAND_SIZE_8 },
&(register_data_t){"bpl", REG_BP, OPERAND_SIZE_8 },
&(register_data_t){"sil", REG_SI, OPERAND_SIZE_8 },
&(register_data_t){"dil", REG_DI, OPERAND_SIZE_8 },
&(register_data_t){"r8b", REG_8, OPERAND_SIZE_8 },
&(register_data_t){"r9b", REG_9, OPERAND_SIZE_8 },
&(register_data_t){"r10b", REG_10, OPERAND_SIZE_8 },
&(register_data_t){"r11b", REG_11, OPERAND_SIZE_8 },
&(register_data_t){"r12b", REG_12, OPERAND_SIZE_8 },
&(register_data_t){"r13b", REG_13, OPERAND_SIZE_8 },
&(register_data_t){"r14b", REG_14, OPERAND_SIZE_8 },
&(register_data_t){"r15b", REG_15, OPERAND_SIZE_8 },
// x87 floating point registers
&(register_data_t){"st0", REG_ST0, OPERAND_SIZE_80},
&(register_data_t){"st1", REG_ST1, OPERAND_SIZE_80},
&(register_data_t){"st2", REG_ST2, OPERAND_SIZE_80},
&(register_data_t){"st3", REG_ST3, OPERAND_SIZE_80},
&(register_data_t){"st4", REG_ST4, OPERAND_SIZE_80},
&(register_data_t){"st5", REG_ST5, OPERAND_SIZE_80},
&(register_data_t){"st6", REG_ST6, OPERAND_SIZE_80},
&(register_data_t){"st7", REG_ST7, OPERAND_SIZE_80},
nullptr,
};

82
src/data/registers.h Normal file
View File

@ -0,0 +1,82 @@
#ifndef INCLUDE_DATA_REGISTERS_H_
#define INCLUDE_DATA_REGISTERS_H_
// Operand widths in bits. Each width is a distinct single-bit flag, so
// several of them can be OR'd together into a mask of acceptable sizes.
// OPERAND_SIZE_INVALID (0) has no bit set and marks an unsupported width.
typedef enum operand_size {
OPERAND_SIZE_INVALID = 0,
OPERAND_SIZE_8 = 1 << 0,
OPERAND_SIZE_16 = 1 << 1,
OPERAND_SIZE_32 = 1 << 2,
OPERAND_SIZE_64 = 1 << 3,
OPERAND_SIZE_80 = 1 << 4,
OPERAND_SIZE_128 = 1 << 5,
OPERAND_SIZE_256 = 1 << 6,
OPERAND_SIZE_512 = 1 << 7,
} operand_size_t;
/**
 * Translate a width given as a plain bit count into its OPERAND_SIZE_* flag.
 *
 * @param bits Width in bits (8, 16, 32, 64, 80, 128, 256 or 512).
 * @return The matching flag, or OPERAND_SIZE_INVALID for any other value.
 */
static inline operand_size_t bits_to_operand_size(int bits) {
  if (bits == 8)
    return OPERAND_SIZE_8;
  if (bits == 16)
    return OPERAND_SIZE_16;
  if (bits == 32)
    return OPERAND_SIZE_32;
  if (bits == 64)
    return OPERAND_SIZE_64;
  if (bits == 80)
    return OPERAND_SIZE_80;
  if (bits == 128)
    return OPERAND_SIZE_128;
  if (bits == 256)
    return OPERAND_SIZE_256;
  if (bits == 512)
    return OPERAND_SIZE_512;
  return OPERAND_SIZE_INVALID;
}
// Identifier of a register independent of the width it is accessed with.
// General purpose registers are numbered consecutively from 0 (REG_A) to 15
// (REG_15); the x87 stack registers form a separate range starting at 0x1000.
// NOTE(review): the general purpose order A, C, D, B, SP, BP, SI, DI
// presumably mirrors the hardware register numbers used when encoding -
// confirm before relying on the numeric values.
typedef enum register_id {
// Special registers
REG_RIP = -1,
// General purpose registers
REG_A = 0x0000,
REG_C,
REG_D,
REG_B,
REG_SP,
REG_BP,
REG_SI,
REG_DI,
REG_8,
REG_9,
REG_10,
REG_11,
REG_12,
REG_13,
REG_14,
REG_15,
// x87 floating point stack registers
REG_ST0 = 0x1000,
REG_ST1,
REG_ST2,
REG_ST3,
REG_ST4,
REG_ST5,
REG_ST6,
REG_ST7,
} register_id_t;
// One row of the register table. The field order matters: registers.c
// initializes entries positionally as {name, id, size}.
typedef struct register_data {
const char *name; // register name as written in assembly source
register_id_t id; // width-independent register identifier
operand_size_t size; // width of the register when referred to by this name
} register_data_t;
// nullptr-terminated list of all known registers (defined in registers.c).
extern register_data_t *const registers[];
#endif // INCLUDE_DATA_REGISTERS_H_

209
src/encoder/encoder.c Normal file
View File

@ -0,0 +1,209 @@
#include "encoder.h"
#include "../data/opcodes.h"
#include "symbols.h"
#include <assert.h>
#include <errno.h>
#include <string.h>
// Error objects returned by the encoder. Each one is a pointer to a single
// file-scope error_t, so callers can identify an error by comparing pointers.
error_t *const err_encoder_invalid_register =
&(error_t){.message = "Invalid register"};
error_t *const err_encoder_number_overflow =
&(error_t){.message = "Number overflows the storage"};
error_t *const err_encoder_invalid_number_format =
&(error_t){.message = "Invalid number format"};
error_t *const err_encoder_invalid_size_suffix =
&(error_t){.message = "Invalid number size suffix"};
error_t *const err_encoder_unknown_symbol_reference =
&(error_t){.message = "Referenced an unknown symbol"};
/**
 * Allocate a zero-initialized encoder together with its symbol table.
 *
 * @param output Receives the new encoder on success; set to nullptr on
 *               failure.
 * @return nullptr on success, err_allocation_failed if the encoder itself
 *         could not be allocated, or whatever error symbol_table_alloc
 *         reported.
 */
error_t *encoder_alloc(encoder_t **output) {
  *output = nullptr;

  encoder_t *created = calloc(1, sizeof(encoder_t));
  if (created == nullptr) {
    return err_allocation_failed;
  }

  error_t *table_err = symbol_table_alloc(&created->symbols);
  if (table_err) {
    // Do not leak the half-constructed encoder.
    free(created);
    return table_err;
  }

  *output = created;
  return nullptr;
}
/**
 * Release an encoder and the symbol table it owns.
 *
 * @param encoder Encoder to destroy; passing nullptr is a no-op.
 */
void encoder_free(encoder_t *encoder) {
  if (encoder != nullptr) {
    symbol_table_free(encoder->symbols);
    free(encoder);
  }
}
/**
 * Tell whether a node is one of the kinds the first pass hands to the
 * symbol table: labels, label references and export/import directives.
 *
 * @param node Node to inspect.
 * @return true for the four symbol-related node ids, false otherwise.
 */
bool encoder_is_symbols_node(ast_node_t *node) {
  return node->id == NODE_LABEL || node->id == NODE_LABEL_REFERENCE ||
         node->id == NODE_EXPORT_DIRECTIVE ||
         node->id == NODE_IMPORT_DIRECTIVE;
}
/**
 * Determine the radix of a number node from the id of its first child.
 *
 * @param number Number node whose first child is a binary, octal, decimal
 *               or hexadecimal node.
 * @return 2, 8, 10 or 16. Any other child id is a programming error: it
 *         trips the assertion, and is unreachable when assertions are
 *         disabled.
 */
int encoder_get_number_base(ast_node_t *number) {
  ast_node_t *literal = number->children[0];

  if (literal->id == NODE_BINARY)
    return 2;
  if (literal->id == NODE_OCTAL)
    return 8;
  if (literal->id == NODE_DECIMAL)
    return 10;
  if (literal->id == NODE_HEXADECIMAL)
    return 16;

  assert(false);
  __builtin_unreachable();
}
/**
 * Check whether a size suffix names a width supported for number literals.
 *
 * @param bits Suffix value in bits; 0 means no suffix was given.
 * @return true for 0, 8, 16, 32 and 64, false for everything else.
 */
bool is_valid_size_suffix(int bits) {
  return bits == 0 || bits == 8 || bits == 16 || bits == 32 || bits == 64;
}
/**
 * Check whether a value needs more than the given number of bits.
 *
 * @param value Value to test.
 * @param bits  Available width in bits. 0 (no explicit width) and widths of
 *              64 or more can hold every uint64_t, so they never overflow.
 * @return true if value does not fit into `bits` bits.
 */
bool is_overflow(uint64_t value, int bits) {
  const bool unconstrained = (bits == 0 || bits >= 64);
  if (unconstrained) {
    return false;
  }
  // Anything left after dropping the low `bits` bits does not fit.
  return (value >> bits) != 0;
}
/**
 * Compute the operand size information for a number literal.
 *
 * @param value Parsed value of the literal.
 * @param bits  Explicit size suffix in bits, or 0 if none was given.
 * @return With a suffix, the single flag for that width. Without one, a mask
 *         of every width (8, 16, 32, 64) the value fits into; 64 is always
 *         part of the mask.
 */
operand_size_t encoder_get_size_mask(uint64_t value, int bits) {
  if (bits != 0) {
    return bits_to_operand_size(bits);
  }

  operand_size_t fits = OPERAND_SIZE_64;
  if ((value >> 32) == 0) {
    fits |= OPERAND_SIZE_32;
  }
  if ((value >> 16) == 0) {
    fits |= OPERAND_SIZE_16;
  }
  if ((value >> 8) == 0) {
    fits |= OPERAND_SIZE_8;
  }
  return fits;
}
/**
 * Parse the text of a number node and store the result in node->value.number.
 *
 * The text is taken from the token of the node's first child. Non-decimal
 * literals carry a two character prefix (0x, 0o or 0b) which is skipped
 * before conversion. The digits may be followed by an optional size suffix
 * of the form ":<bits>".
 *
 * @param node AST node with id NODE_NUMBER and at least one child.
 * @return nullptr on success, otherwise one of
 *         err_encoder_number_overflow (value does not fit into 64 bits or
 *         into the requested size),
 *         err_encoder_invalid_number_format (no digits, or trailing garbage),
 *         err_encoder_invalid_size_suffix (suffix is not 8, 16, 32 or 64).
 */
error_t *encoder_set_number_value(ast_node_t *node) {
  assert(node->id == NODE_NUMBER);
  assert(node->children[0]);

  const char *number = node->children[0]->token_entry->token.value;
  int base = encoder_get_number_base(node);
  if (base != 10)
    number += 2; // all except base 10 use a 0x, 0o or 0b prefix

  char *endptr;
  errno = 0;
  uint64_t value = strtoull(number, &endptr, base);
  if (errno == ERANGE)
    return err_encoder_number_overflow;
  if (endptr == number)
    return err_encoder_invalid_number_format;

  // Keep the suffix in the type strtol returns. Narrowing to int before
  // validating would let an oversized suffix wrap around to a valid size.
  long suffix_bits = 0;
  if (*endptr == ':') {
    const char *suffix = endptr + 1;
    suffix_bits = strtol(suffix, &endptr, 10);
    if (endptr == suffix)
      return err_encoder_invalid_number_format;
  }
  if (*endptr != '\0')
    return err_encoder_invalid_number_format;

  // The range check also rejects the LONG_MIN/LONG_MAX values strtol returns
  // when the suffix itself is out of range.
  if (suffix_bits < 0 || suffix_bits > 64 ||
      !is_valid_size_suffix((int)suffix_bits))
    return err_encoder_invalid_size_suffix;

  int bits = (int)suffix_bits;
  if (is_overflow(value, bits))
    return err_encoder_number_overflow;

  node->value.number.value = value;
  node->value.number.size = encoder_get_size_mask(value, bits);
  return nullptr;
}
/**
 * Look up a register node's token text in the registers table and store the
 * matching id and size in node->value.reg.
 *
 * @param node AST node with id NODE_REGISTER.
 * @return nullptr on success, err_encoder_invalid_register if the token text
 *         does not match the name of any entry in the registers table.
 */
error_t *encoder_set_register_value(ast_node_t *node) {
  assert(node->id == NODE_REGISTER);

  const char *value = node->token_entry->token.value;
  for (size_t i = 0; registers[i] != nullptr; ++i) {
    if (strcmp(value, registers[i]->name) == 0) {
      node->value.reg.id = registers[i]->id;
      node->value.reg.size = registers[i]->size;
      // Stop at the first match; the caller inspects the returned error, so
      // every path has to return a defined value.
      return nullptr;
    }
  }
  return err_encoder_invalid_register;
}
/**
 * First pass over the AST, visiting `node` and then each of its children
 * in order.
 *
 * Symbol-related nodes are recorded in the encoder's symbol table, number
 * nodes get their parsed value and size, and register nodes get their
 * register id and size. Traversal stops at the first error.
 *
 * @param encoder Encoder whose symbol table is updated.
 * @param node    Root of the (sub)tree to process.
 * @return nullptr on success, otherwise the first error encountered.
 */
error_t *encoder_first_pass(encoder_t *encoder, ast_node_t *node) {
  error_t *status = nullptr;

  if (encoder_is_symbols_node(node)) {
    status = symbol_table_update(encoder->symbols, node);
  } else if (node->id == NODE_NUMBER) {
    status = encoder_set_number_value(node);
  } else if (node->id == NODE_REGISTER) {
    status = encoder_set_register_value(node);
  }

  if (status) {
    return status;
  }

  // Process the children in order, stopping as soon as one of them fails.
  for (size_t child = 0; child < node->len; ++child) {
    status = encoder_first_pass(encoder, node->children[child]);
    if (status) {
      return status;
    }
  }
  return nullptr;
}
/**
 * Look up the opcode table entry for an instruction node.
 *
 * Placeholder: the table is walked to its nullptr terminator, but no entry
 * is compared against `instruction` yet, so the result is always nullptr.
 *
 * @param instruction Instruction node to find an encoding for (currently
 *                    unused).
 * @return Always nullptr.
 */
opcode_data_t *encoder_find_opcode(ast_node_t *instruction) {
  size_t index = 0;
  while (opcodes[index] != nullptr) {
    ++index;
  }
  return nullptr;
}
/**
 * Scan the encoder's symbol table for entries whose kind is still
 * SYMBOL_REFERENCE.
 *
 * @param encoder Encoder whose symbol table is checked.
 * @return err_encoder_unknown_symbol_reference if such an entry exists,
 *         nullptr otherwise.
 */
error_t *encoder_check_symbols(encoder_t *encoder) {
  size_t index = 0;
  while (index < encoder->symbols->len) {
    if (encoder->symbols->symbols[index].kind == SYMBOL_REFERENCE) {
      return err_encoder_unknown_symbol_reference;
    }
    ++index;
  }
  return nullptr;
}
/**
 * Run the encoder over an AST.
 *
 * Currently this performs the first pass (symbols, numbers, registers) and
 * then checks the symbol table; the symbol check is skipped if the first
 * pass fails.
 *
 * @param encoder Encoder holding the symbol table.
 * @param ast     Root node of the tree to encode.
 * @return nullptr on success, otherwise the first error reported.
 */
error_t *encoder_encode(encoder_t *encoder, ast_node_t *ast) {
  error_t *failure = encoder_first_pass(encoder, ast);
  if (failure == nullptr) {
    failure = encoder_check_symbols(encoder);
  }
  return failure;
}

20
src/encoder/encoder.h Normal file
View File

@ -0,0 +1,20 @@
#ifndef INCLUDE_ENCODER_ENCODER_H_
#define INCLUDE_ENCODER_ENCODER_H_
#include "symbols.h"
// State carried through an encoding run.
typedef struct encoder {
symbol_table_t *symbols; // allocated by encoder_alloc, released by encoder_free
} encoder_t;
// Allocate an encoder and its symbol table. On success *encoder receives the
// new object and nullptr is returned; on failure *encoder is set to nullptr
// and an error is returned.
error_t *encoder_alloc(encoder_t **encoder);
// Run the encoder over the AST rooted at ast. Returns nullptr on success or
// the first error encountered.
error_t *encoder_encode(encoder_t *encoder, ast_node_t *ast);
// Release an encoder and its symbol table. Passing nullptr is a no-op.
void encoder_free(encoder_t *encoder);
// Error objects the encoder can return; compare against these pointers to
// identify a failure.
extern error_t *const err_encoder_invalid_register;
extern error_t *const err_encoder_number_overflow;
extern error_t *const err_encoder_invalid_number_format;
extern error_t *const err_encoder_invalid_size_suffix;
extern error_t *const err_encoder_unknown_symbol_reference;
#endif // INCLUDE_ENCODER_ENCODER_H_

View File

@ -1,5 +1,6 @@
#include "primitives.h" #include "primitives.h"
#include "../ast.h" #include "../ast.h"
#include "../data/registers.h"
#include <string.h> #include <string.h>
parse_result_t parse_identifier(tokenlist_entry_t *current) { parse_result_t parse_identifier(tokenlist_entry_t *current) {
@ -67,23 +68,9 @@ parse_result_t parse_label_reference(tokenlist_entry_t *current) {
nullptr); nullptr);
} }
const char *registers[] = {
// 64-bit registers
"rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi", "r8", "r9", "r10",
"r11", "r12", "r13", "r14", "r15",
// 32-bit registers
"eax", "ecx", "edx", "ebx", "esp", "ebp", "esi", "edi", "r8d", "r9d",
"r10d", "r11d", "r12d", "r13d", "r14d", "r15d",
// 16-bit registers
"ax", "cx", "dx", "bx", "sp", "bp", "si", "di", "r8w", "r9w", "r10w",
"r11w", "r12w", "r13w", "r14w", "r15w",
// 8-bit low registers
"al", "cl", "dl", "bl", "spl", "bpl", "sil", "dil", "r8b", "r9b", "r10b",
"r11b", "r12b", "r13b", "r14b", "r15b", nullptr};
bool is_register_token(lexer_token_t *token) { bool is_register_token(lexer_token_t *token) {
for (size_t i = 0; registers[i] != nullptr; ++i) for (size_t i = 0; registers[i] != nullptr; ++i)
if (strcmp(token->value, registers[i]) == 0) if (strcmp(token->value, registers[i]->name) == 0)
return true; return true;
return false; return false;
} }