Compare commits

..

15 Commits

Author SHA1 Message Date
6380bcc442 Slightly change the valid test input file
All checks were successful
Validate the build / validate-build (push) Successful in 25s
2025-04-02 12:04:42 +02:00
9de9005059 add functionality to main to parse and print the ast 2025-04-02 12:04:42 +02:00
4cb2bf7165 Fix parse_directive grammar rule 2025-04-02 12:04:42 +02:00
251b1b82b1 Add registers and fix section primitive parser 2025-04-02 12:04:42 +02:00
8faf73771b Fix parser loops in parse_any and parse_consecutive 2025-04-02 12:04:42 +02:00
4d8cbc066b Add more grammar rules to the parser 2025-04-02 12:04:42 +02:00
5968997d19 Add a parser combinator to parse a delimited list 2025-04-02 12:04:42 +02:00
eb3b9d6366 TODO: REVIEW ME AND WRITE PROPER MESSAGE
Fix lexer issue where consuming n tokens always fails if there are n
tokens and always succeeds if there aren't n tokens
2025-04-02 12:04:42 +02:00
ff4b01acea Use new validator function for parse_token calls on all primitives
Also adds new validated primitives for NODE_SECTION and NODE_REGISTER
2025-04-02 12:04:42 +02:00
9eec8e5e1a Fix incorrect error returned in parse_consecutive 2025-04-02 12:04:42 +02:00
234f614886 Add basic parser combinators 2025-04-02 12:04:42 +02:00
8ac844c2b0 Add "primitive" parsers for all the semantic tokens in the lexer grammar 2025-04-02 12:04:42 +02:00
56d1054b74 Fix parse_token to add the correct information to a parse node 2025-04-02 12:04:42 +02:00
d7dc6c802e Partial parser implementation 2025-04-02 12:04:42 +02:00
e2fa229c1d Add basic parser utilities 2025-04-02 12:04:36 +02:00
13 changed files with 84 additions and 265 deletions

View File

@ -1,55 +0,0 @@
# Linker file format
```C
// Layout of a linker object file, written as C-like pseudocode: a fixed
// header, a table of offsets, then the string, section, symbol and relocation
// tables.
// NOTE(review): `[static N]` is not valid C for a struct member; it appears to
// be shorthand for "array of N elements" — confirm.
struct object_file {
uint64_t magic; // ".oo-bin"
uint64_t version; // 1
uint64_t architecture; // AMD64(0)
// NOTE(review): presumably the file offset of the `offsets` table below —
// confirm.
uint64_t offsets_offset;
// One entry per table that follows.
// NOTE(review): presumably file offsets of those tables — confirm.
struct offsets {
uint64_t strings;
uint64_t sections;
uint64_t symbols;
uint64_t relocations;
} offsets;
// String data preceded by its size.
struct string_table {
uint64_t size;
uint8_t data[static size];
} strings;
// `count` section entries.
struct section_table {
uint32_t count;
struct section_entry {
// NOTE(review): presumably a reference into the string table — confirm.
uint32_t name;
uint64_t offset;
uint64_t size_on_disk;
uint64_t size_in_memory;
uint64_t flags;
} sections[static count];
} sections;
// `count` symbol entries.
struct symbol_table {
uint32_t count;
struct symbol_entry {
// NOTE(review): presumably a reference into the string table — confirm.
uint32_t name;
uint8_t kind; // IMPORT(0) | EXPORT(1) | LOCAL(2)
// NOTE(review): presumably an index into the section table — confirm.
uint32_t section;
uint64_t offset;
} symbols[static count];
} symbols;
// `count` relocation entries.
struct relocation_table {
uint32_t count;
struct relocation_entry {
// NOTE(review): presumably an index into the section table — confirm.
uint32_t section;
uint64_t offset;
uint8_t size;
// NOTE(review): presumably an index into the symbol table — confirm.
uint32_t symbol;
uint8_t kind; // ABSOLUTE(0) | RELATIVE(1)
} relocations[static count];
} relocations;
};
```

View File

@ -1,6 +1,6 @@
#include "error.h" #include "error.h"
#include "lexer.h" #include "lexer.h"
#include "parser/parser.h" #include "parser.h"
#include "tokenlist.h" #include "tokenlist.h"
#include <limits.h> #include <limits.h>

53
src/parser.c Normal file
View File

@ -0,0 +1,53 @@
#include "parser.h"
#include "ast.h"
#include "lexer.h"
#include "parser_combinators.h"
#include "parser_primitives.h"
#include "parser_util.h"
#include "tokenlist.h"
// Parses a numeric literal: hands the octal, decimal, hexadecimal and binary
// primitive parsers (nullptr-terminated) to parse_any and returns its result.
parse_result_t parse_number(tokenlist_entry_t *current) {
  parser_t alternatives[] = {
      parse_octal,
      parse_decimal,
      parse_hexadecimal,
      parse_binary,
      nullptr, // end-of-list sentinel
  };
  parse_result_t outcome = parse_any(current, alternatives);
  return outcome;
}
// Parses a single operand: passes the register and number parsers
// (nullptr-terminated) to parse_any and returns its result.
parse_result_t parse_operand(tokenlist_entry_t *current) {
  // FIXME: not the correct set of parsers
  parser_t alternatives[] = {
      parse_register,
      parse_number,
      nullptr, // end-of-list sentinel
  };
  parse_result_t outcome = parse_any(current, alternatives);
  return outcome;
}
// Parses a TOKEN_COMMA-delimited list of operands into a NODE_OPERANDS node by
// delegating to parse_list with parse_operand as the element parser.
// NOTE(review): the meaning of the `true` flag is defined by parse_list —
// confirm there.
parse_result_t parse_operands(tokenlist_entry_t *current) {
  parse_result_t operand_list =
      parse_list(current, NODE_OPERANDS, true, TOKEN_COMMA, parse_operand);
  return operand_list;
}
// Parses a label: an identifier followed by a colon, handed to
// parse_consecutive to be collected under a NODE_LABEL node.
parse_result_t parse_label(tokenlist_entry_t *current) {
  parser_t sequence[] = {
      parse_identifier,
      parse_colon,
      nullptr, // end-of-list sentinel
  };
  parse_result_t outcome = parse_consecutive(current, NODE_LABEL, sequence);
  return outcome;
}
// Parses a section directive body: parse_section followed by
// parse_identifier, handed to parse_consecutive to be collected under a
// NODE_SECTION_DIRECTIVE node.
parse_result_t parse_section_directive(tokenlist_entry_t *current) {
  parser_t sequence[] = {
      parse_section,
      parse_identifier,
      nullptr, // end-of-list sentinel
  };
  parse_result_t outcome =
      parse_consecutive(current, NODE_SECTION_DIRECTIVE, sequence);
  return outcome;
}
// Parses a directive: a dot followed by a section directive, handed to
// parse_consecutive to be collected under a NODE_DIRECTIVE node.
parse_result_t parse_directive(tokenlist_entry_t *current) {
  parser_t sequence[] = {
      parse_dot,
      parse_section_directive,
      nullptr, // end-of-list sentinel
  };
  parse_result_t outcome = parse_consecutive(current, NODE_DIRECTIVE, sequence);
  return outcome;
}
// Parses an instruction: an identifier followed by an operand list, handed to
// parse_consecutive to be collected under a NODE_INSTRUCTION node.
parse_result_t parse_instruction(tokenlist_entry_t *current) {
  parser_t sequence[] = {
      parse_identifier,
      parse_operands,
      nullptr, // end-of-list sentinel
  };
  parse_result_t outcome =
      parse_consecutive(current, NODE_INSTRUCTION, sequence);
  return outcome;
}
// Parses one statement: passes the label, directive and instruction parsers
// (nullptr-terminated) to parse_any and returns its result.
parse_result_t parse_statement(tokenlist_entry_t *current) {
  parser_t alternatives[] = {
      parse_label,
      parse_directive,
      parse_instruction,
      nullptr, // end-of-list sentinel
  };
  parse_result_t outcome = parse_any(current, alternatives);
  return outcome;
}
// Parser entry point: delegates to parse_many with parse_statement as the
// element parser and NODE_PROGRAM as the resulting node id.
// NOTE(review): the meaning of the `true` flag is defined by parse_many —
// confirm there.
parse_result_t parse(tokenlist_entry_t *current) {
  parse_result_t program =
      parse_many(current, NODE_PROGRAM, true, parse_statement);
  return program;
}

11
src/parser.h Normal file
View File

@ -0,0 +1,11 @@
#ifndef INCLUDE_SRC_PARSER_H_
#define INCLUDE_SRC_PARSER_H_
#include "ast.h"
#include "error.h"
#include "parser_util.h"
#include "tokenlist.h"
parse_result_t parse(tokenlist_entry_t *current);
#endif // INCLUDE_SRC_PARSER_H_

View File

@ -1,140 +0,0 @@
#include "parser.h"
#include "../ast.h"
#include "../lexer.h"
#include "../tokenlist.h"
#include "combinators.h"
#include "primitives.h"
#include "util.h"
// Parses a numeric literal: hands the octal, decimal, hexadecimal and binary
// primitive parsers (nullptr-terminated) to parse_any, then passes the result
// through parse_result_wrap with NODE_NUMBER.
parse_result_t parse_number(tokenlist_entry_t *current) {
  parser_t alternatives[] = {
      parse_octal,
      parse_decimal,
      parse_hexadecimal,
      parse_binary,
      nullptr, // end-of-list sentinel
  };
  return parse_result_wrap(NODE_NUMBER, parse_any(current, alternatives));
}
// Parses a sign token: passes the plus and minus parsers (nullptr-terminated)
// to parse_any and returns its result.
parse_result_t parse_plus_or_minus(tokenlist_entry_t *current) {
  parser_t alternatives[] = {
      parse_plus,
      parse_minus,
      nullptr, // end-of-list sentinel
  };
  parse_result_t outcome = parse_any(current, alternatives);
  return outcome;
}
// Parses a register index: plus, register, asterisk, number, in that sequence,
// handed to parse_consecutive to be collected under a NODE_REGISTER_INDEX node.
parse_result_t parse_register_index(tokenlist_entry_t *current) {
  parser_t sequence[] = {
      parse_plus,
      parse_register,
      parse_asterisk,
      parse_number,
      nullptr, // end-of-list sentinel
  };
  parse_result_t outcome =
      parse_consecutive(current, NODE_REGISTER_INDEX, sequence);
  return outcome;
}
// Parses a register offset: a plus-or-minus sign followed by a number, handed
// to parse_consecutive to be collected under a NODE_REGISTER_OFFSET node.
parse_result_t parse_register_offset(tokenlist_entry_t *current) {
  parser_t sequence[] = {
      parse_plus_or_minus,
      parse_number,
      nullptr, // end-of-list sentinel
  };
  parse_result_t outcome =
      parse_consecutive(current, NODE_REGISTER_OFFSET, sequence);
  return outcome;
}
// Parses `<register> <register_index>? <register_offset>?` into a
// NODE_REGISTER_EXPRESSION node.
//
// The leading register is mandatory: if it fails to parse, its result is
// returned as-is. The index and offset parts are optional: a failure there is
// discarded (its error is freed) and parsing continues from the same token.
// If attaching any child fails, both the child and the partially built
// expression node are freed and the error is returned.
parse_result_t parse_register_expression(tokenlist_entry_t *current) {
  // Optional trailing parts, tried once each in this order.
  parse_result_t (*optional_parts[])(tokenlist_entry_t *) = {
      parse_register_index,
      parse_register_offset,
      nullptr, // end-of-list sentinel
  };

  ast_node_t *expr;
  error_t *err = ast_node_alloc(&expr);
  if (err)
    return parse_error(err);
  expr->id = NODE_REGISTER_EXPRESSION;

  // <register>
  parse_result_t base = parse_register(current);
  if (base.err) {
    ast_node_free(expr);
    return base;
  }
  err = ast_node_add_child(expr, base.node);
  if (err) {
    ast_node_free(base.node);
    goto fail;
  }
  current = base.next;

  // <register_index>? <register_offset>?
  for (size_t i = 0; optional_parts[i] != nullptr; ++i) {
    parse_result_t part = optional_parts[i](current);
    if (part.err) {
      // Optional part is absent: drop the error and stay on the same token.
      error_free(part.err);
      continue;
    }
    err = ast_node_add_child(expr, part.node);
    if (err) {
      ast_node_free(part.node);
      goto fail;
    }
    current = part.next;
  }

  return parse_success(expr, current);

fail:
  ast_node_free(expr);
  return parse_error(err);
}
// Parses an immediate operand: passes the number and identifier parsers
// (nullptr-terminated) to parse_any, then passes the result through
// parse_result_wrap with NODE_IMMEDIATE.
parse_result_t parse_immediate(tokenlist_entry_t *current) {
  parser_t alternatives[] = {
      parse_number,
      parse_identifier,
      nullptr, // end-of-list sentinel
  };
  return parse_result_wrap(NODE_IMMEDIATE, parse_any(current, alternatives));
}
// Parses the contents of a memory operand: passes the register-expression and
// identifier parsers (nullptr-terminated) to parse_any and returns its result.
parse_result_t parse_memory_expression(tokenlist_entry_t *current) {
  parser_t alternatives[] = {
      parse_register_expression,
      parse_identifier,
      nullptr, // end-of-list sentinel
  };
  parse_result_t outcome = parse_any(current, alternatives);
  return outcome;
}
// Parses a memory operand: left bracket, memory expression, right bracket,
// handed to parse_consecutive to be collected under a NODE_MEMORY node.
parse_result_t parse_memory(tokenlist_entry_t *current) {
  parser_t sequence[] = {
      parse_lbracket,
      parse_memory_expression,
      parse_rbracket,
      nullptr, // end-of-list sentinel
  };
  parse_result_t outcome = parse_consecutive(current, NODE_MEMORY, sequence);
  return outcome;
}
// Parses a single operand: passes the register, memory and immediate parsers
// (nullptr-terminated) to parse_any and returns its result.
parse_result_t parse_operand(tokenlist_entry_t *current) {
  parser_t alternatives[] = {
      parse_register,
      parse_memory,
      parse_immediate,
      nullptr, // end-of-list sentinel
  };
  parse_result_t outcome = parse_any(current, alternatives);
  return outcome;
}
// Parses a TOKEN_COMMA-delimited list of operands into a NODE_OPERANDS node by
// delegating to parse_list with parse_operand as the element parser.
// NOTE(review): the meaning of the `true` flag is defined by parse_list —
// confirm there.
parse_result_t parse_operands(tokenlist_entry_t *current) {
  parse_result_t operand_list =
      parse_list(current, NODE_OPERANDS, true, TOKEN_COMMA, parse_operand);
  return operand_list;
}
// Parses a label: an identifier followed by a colon, handed to
// parse_consecutive to be collected under a NODE_LABEL node.
parse_result_t parse_label(tokenlist_entry_t *current) {
  parser_t sequence[] = {
      parse_identifier,
      parse_colon,
      nullptr, // end-of-list sentinel
  };
  parse_result_t outcome = parse_consecutive(current, NODE_LABEL, sequence);
  return outcome;
}
// Parses a section directive body: parse_section followed by
// parse_identifier, handed to parse_consecutive to be collected under a
// NODE_SECTION_DIRECTIVE node.
parse_result_t parse_section_directive(tokenlist_entry_t *current) {
  parser_t sequence[] = {
      parse_section,
      parse_identifier,
      nullptr, // end-of-list sentinel
  };
  parse_result_t outcome =
      parse_consecutive(current, NODE_SECTION_DIRECTIVE, sequence);
  return outcome;
}
// Parses a directive: a dot followed by a section directive, handed to
// parse_consecutive to be collected under a NODE_DIRECTIVE node.
parse_result_t parse_directive(tokenlist_entry_t *current) {
  parser_t sequence[] = {
      parse_dot,
      parse_section_directive,
      nullptr, // end-of-list sentinel
  };
  parse_result_t outcome = parse_consecutive(current, NODE_DIRECTIVE, sequence);
  return outcome;
}
// Parses an instruction: an identifier followed by an operand list, handed to
// parse_consecutive to be collected under a NODE_INSTRUCTION node.
parse_result_t parse_instruction(tokenlist_entry_t *current) {
  parser_t sequence[] = {
      parse_identifier,
      parse_operands,
      nullptr, // end-of-list sentinel
  };
  parse_result_t outcome =
      parse_consecutive(current, NODE_INSTRUCTION, sequence);
  return outcome;
}
// Parses one statement: passes the label, directive and instruction parsers
// (nullptr-terminated) to parse_any and returns its result.
parse_result_t parse_statement(tokenlist_entry_t *current) {
  parser_t alternatives[] = {
      parse_label,
      parse_directive,
      parse_instruction,
      nullptr, // end-of-list sentinel
  };
  parse_result_t outcome = parse_any(current, alternatives);
  return outcome;
}
// Parser entry point: delegates to parse_many with parse_statement as the
// element parser and NODE_PROGRAM as the resulting node id.
// NOTE(review): the meaning of the `true` flag is defined by parse_many —
// confirm there.
parse_result_t parse(tokenlist_entry_t *current) {
  parse_result_t program =
      parse_many(current, NODE_PROGRAM, true, parse_statement);
  return program;
}

View File

@ -1,9 +0,0 @@
#ifndef INCLUDE_PARSER_PARSER_H_
#define INCLUDE_PARSER_PARSER_H_
#include "../tokenlist.h"
#include "util.h"
parse_result_t parse(tokenlist_entry_t *current);
#endif // INCLUDE_PARSER_PARSER_H_

View File

@ -1,4 +1,4 @@
#include "combinators.h" #include "parser_combinators.h"
// Parse a list of the given parser delimited by the given token id. Does not // Parse a list of the given parser delimited by the given token id. Does not
// store the delimiters in the parent node // store the delimiters in the parent node

View File

@ -1,7 +1,4 @@
#ifndef INCLUDE_PARSER_COMBINATORS_H_ #include "parser_util.h"
#define INCLUDE_PARSER_COMBINATORS_H_
#include "util.h"
typedef parse_result_t (*parser_t)(tokenlist_entry_t *); typedef parse_result_t (*parser_t)(tokenlist_entry_t *);
@ -21,5 +18,3 @@ parse_result_t parse_list(tokenlist_entry_t *current, node_id_t id,
// wraps the parsed nodes in a new parent node. // wraps the parsed nodes in a new parent node.
parse_result_t parse_consecutive(tokenlist_entry_t *current, node_id_t id, parse_result_t parse_consecutive(tokenlist_entry_t *current, node_id_t id,
parser_t parsers[]); parser_t parsers[]);
#endif // INCLUDE_PARSER_COMBINATORS_H_

View File

@ -1,5 +1,5 @@
#include "primitives.h" #include "parser_primitives.h"
#include "../ast.h" #include "ast.h"
#include <string.h> #include <string.h>
parse_result_t parse_identifier(tokenlist_entry_t *current) { parse_result_t parse_identifier(tokenlist_entry_t *current) {
@ -62,11 +62,6 @@ parse_result_t parse_dot(tokenlist_entry_t *current) {
return parse_token(current, TOKEN_DOT, NODE_DOT, nullptr); return parse_token(current, TOKEN_DOT, NODE_DOT, nullptr);
} }
// Parses a label reference: matches a TOKEN_IDENTIFIER token via parse_token
// and tags the resulting node as NODE_LABEL_REFERENCE. No extra validator is
// supplied (nullptr).
parse_result_t parse_label_reference(tokenlist_entry_t *current) {
  parse_result_t reference =
      parse_token(current, TOKEN_IDENTIFIER, NODE_LABEL_REFERENCE, nullptr);
  return reference;
}
const char *registers[] = { const char *registers[] = {
// 64-bit registers // 64-bit registers
"rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi", "r8", "r9", "r10", "rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi", "r8", "r9", "r10",
@ -80,7 +75,6 @@ const char *registers[] = {
// 8-bit low registers // 8-bit low registers
"al", "cl", "dl", "bl", "spl", "bpl", "sil", "dil", "r8b", "r9b", "r10b", "al", "cl", "dl", "bl", "spl", "bpl", "sil", "dil", "r8b", "r9b", "r10b",
"r11b", "r12b", "r13b", "r14b", "r15b", nullptr}; "r11b", "r12b", "r13b", "r14b", "r15b", nullptr};
bool is_register_token(lexer_token_t *token) { bool is_register_token(lexer_token_t *token) {
for (size_t i = 0; registers[i] != nullptr; ++i) for (size_t i = 0; registers[i] != nullptr; ++i)
if (strcmp(token->value, registers[i]) == 0) if (strcmp(token->value, registers[i]) == 0)

View File

@ -1,7 +1,7 @@
#ifndef INCLUDE_PARSER_PRIMITIVES_H_ #ifndef INCLUDE_SRC_PARSER_PRIMITIVES_H_
#define INCLUDE_PARSER_PRIMITIVES_H_ #define INCLUDE_SRC_PARSER_PRIMITIVES_H_
#include "util.h" #include "parser_util.h"
parse_result_t parse_identifier(tokenlist_entry_t *current); parse_result_t parse_identifier(tokenlist_entry_t *current);
parse_result_t parse_decimal(tokenlist_entry_t *current); parse_result_t parse_decimal(tokenlist_entry_t *current);
@ -18,7 +18,6 @@ parse_result_t parse_plus(tokenlist_entry_t *current);
parse_result_t parse_minus(tokenlist_entry_t *current); parse_result_t parse_minus(tokenlist_entry_t *current);
parse_result_t parse_asterisk(tokenlist_entry_t *current); parse_result_t parse_asterisk(tokenlist_entry_t *current);
parse_result_t parse_dot(tokenlist_entry_t *current); parse_result_t parse_dot(tokenlist_entry_t *current);
parse_result_t parse_label_reference(tokenlist_entry_t *current);
/* These are "primitives" with a different name and some extra validation on top /* These are "primitives" with a different name and some extra validation on top
* for example, register is just an identifier but it only matches a limited set * for example, register is just an identifier but it only matches a limited set
@ -27,4 +26,4 @@ parse_result_t parse_label_reference(tokenlist_entry_t *current);
parse_result_t parse_register(tokenlist_entry_t *current); parse_result_t parse_register(tokenlist_entry_t *current);
parse_result_t parse_section(tokenlist_entry_t *current); parse_result_t parse_section(tokenlist_entry_t *current);
#endif // INCLUDE_PARSER_PRIMITIVES_H_ #endif // INCLUDE_SRC_PARSER_PRIMITIVES_H_

View File

@ -1,5 +1,5 @@
#include "util.h" #include "parser_util.h"
#include "../tokenlist.h" #include "tokenlist.h"
error_t *err_parse_no_match = error_t *err_parse_no_match =
&(error_t){.message = "parsing failed to find the correct token sequence"}; &(error_t){.message = "parsing failed to find the correct token sequence"};
@ -33,24 +33,3 @@ parse_result_t parse_token(tokenlist_entry_t *current,
return parse_success(node, current->next); return parse_success(node, current->next);
} }
// Wraps the node of a successful parse result in a new parent node with the
// given id.
//
// - If `result` carries an error it is returned unchanged.
// - On success the returned result holds the new parent node (with
//   `result.node` as its only child) and the same `next` token as `result`.
// - If allocating the parent or attaching the child fails, every node owned
//   here is freed and the error is returned via parse_error.
parse_result_t parse_result_wrap(node_id_t id, parse_result_t result) {
  if (result.err)
    return result;

  ast_node_t *node;
  error_t *err = ast_node_alloc(&node);
  if (err) {
    ast_node_free(result.node);
    return parse_error(err);
  }
  node->id = id;

  err = ast_node_add_child(node, result.node);
  if (err) {
    // The child was not attached, so both nodes must be released separately;
    // previously the freshly allocated wrapper leaked on this path.
    ast_node_free(result.node);
    ast_node_free(node);
    return parse_error(err);
  }

  return parse_success(node, result.next);
}

View File

@ -1,9 +1,9 @@
#ifndef INCLUDE_PARSER_UTIL_H_ #ifndef INCLUDE_SRC_PARSER_UTIL_H_
#define INCLUDE_PARSER_UTIL_H_ #define INCLUDE_SRC_PARSER_UTIL_H_
#include "../ast.h" #include "ast.h"
#include "../error.h" #include "error.h"
#include "../tokenlist.h" #include "tokenlist.h"
typedef struct parse_result { typedef struct parse_result {
error_t *err; error_t *err;
@ -19,8 +19,9 @@ parse_result_t parse_success(ast_node_t *ast, tokenlist_entry_t *next);
parse_result_t parse_token(tokenlist_entry_t *current, parse_result_t parse_token(tokenlist_entry_t *current,
lexer_token_id_t token_id, node_id_t ast_id, lexer_token_id_t token_id, node_id_t ast_id,
token_validator_t is_valid); token_validator_t is_valid);
parse_result_t parse_result_wrap(node_id_t id, parse_result_t result);
tokenlist_entry_t *skip_insignificant(tokenlist_entry_t *);
extern error_t *err_parse_no_match; extern error_t *err_parse_no_match;
#endif // INCLUDE_PARSER_UTIL_H_ #endif // INCLUDE_SRC_PARSER_UTIL_H_

View File

@ -1,17 +1,8 @@
.section text .section text
; Small valid code snippet that should contain all different AST nodes
_start: _start:
mov eax, ebx mov eax, ebx
lea eax, [eax + ebx * 4 + 8] mov eax, 555 ; move 555 into eax
lea eax, [eax + 8]
lea eax, [eax + ebx * 8]
lea eax, [esp - 24]
lea eax, [eax + ebx * 4 - 8]
lea eax, [_start]
mov eax, _start
mov eax, 555
push 0o777 push 0o777
xor eax, 0xDEADBEEF xor eax, 0xDEADBEEF
and ecx, 0o770 and ecx, 0o770