Compare commits

..

15 Commits

Author SHA1 Message Date
6380bcc442 Slightly change the valid test input file
All checks were successful
Validate the build / validate-build (push) Successful in 25s
2025-04-02 12:04:42 +02:00
9de9005059 add functionality to main to parse and print the ast 2025-04-02 12:04:42 +02:00
4cb2bf7165 Fix parse_directive grammar rule 2025-04-02 12:04:42 +02:00
251b1b82b1 Add registers and fix section primitive parser 2025-04-02 12:04:42 +02:00
8faf73771b Fix parser loops in parse_any and parse_consecutive 2025-04-02 12:04:42 +02:00
4d8cbc066b Add more grammar rules to the parser 2025-04-02 12:04:42 +02:00
5968997d19 Add a parser combinator to parse a delimited list 2025-04-02 12:04:42 +02:00
eb3b9d6366 TODO: REVIEW ME AND WRITE PROPER MESSAGE
Fix lexer issue where consuming n tokens always fails if there are n
tokens and always succeeds if there aren't n tokens
2025-04-02 12:04:42 +02:00
ff4b01acea Use new validator function for parse_token calls on all primitives
Also adds new validated primitives for NODE_SECTION and NODE_REGISTER
2025-04-02 12:04:42 +02:00
9eec8e5e1a Fix incorrect error returned in parse_consecutive 2025-04-02 12:04:42 +02:00
234f614886 Add basic parser combinators 2025-04-02 12:04:42 +02:00
8ac844c2b0 Add "primitive" parsers for all the semantic tokens in the lexer grammar 2025-04-02 12:04:42 +02:00
56d1054b74 Fix parse_token to add the correct information to a parse node 2025-04-02 12:04:42 +02:00
d7dc6c802e Partial parser implementation 2025-04-02 12:04:42 +02:00
e2fa229c1d Add basic parser utilities 2025-04-02 12:04:36 +02:00
13 changed files with 84 additions and 265 deletions

View File

@ -1,55 +0,0 @@
# Linker file format
```C
// Layout of an object file in the linker file format, written as a C-like
// struct. The `[static n]` array notation is descriptive only: it marks an
// array whose element count is given by the `size`/`count` field before it.
struct object_file {
uint64_t magic; // ".oo-bin" (7 characters; NOTE(review): content of the 8th byte is not specified here)
uint64_t version; // 1
uint64_t architecture; // AMD64(0)
uint64_t offsets_offset; // presumably the file offset of `offsets` below -- TODO confirm
// One entry per table that follows; presumably each is the file offset of
// that table -- TODO confirm.
struct offsets {
uint64_t strings;
uint64_t sections;
uint64_t symbols;
uint64_t relocations;
} offsets;
// String data: a byte count followed by that many bytes.
struct string_table {
uint64_t size;
uint8_t data[static size];
} strings;
// Section table: an entry count followed by that many entries.
struct section_table {
uint32_t count;
struct section_entry {
uint32_t name; // presumably a reference into the string table -- TODO confirm
uint64_t offset;
uint64_t size_on_disk;
uint64_t size_in_memory;
uint64_t flags; // NOTE(review): flag bits are not defined in this description
} sections[static count];
} sections;
// Symbol table: an entry count followed by that many entries.
struct symbol_table {
uint32_t count;
struct symbol_entry {
uint32_t name; // presumably a reference into the string table -- TODO confirm
uint8_t kind; // IMPORT(0) | EXPORT(1) | LOCAL(2)
uint32_t section; // presumably an index into the section table -- TODO confirm
uint64_t offset;
} symbols[static count];
} symbols;
// Relocation table: an entry count followed by that many entries.
struct relocation_table {
uint32_t count;
struct relocation_entry {
uint32_t section; // presumably an index into the section table -- TODO confirm
uint64_t offset;
uint8_t size; // NOTE(review): unit (bytes or bits) is not stated here
uint32_t symbol; // presumably an index into the symbol table -- TODO confirm
uint8_t kind; // ABSOLUTE(0) | RELATIVE(1)
} relocations[static count];
} relocations;
};
```

View File

@ -1,6 +1,6 @@
#include "error.h"
#include "lexer.h"
#include "parser/parser.h"
#include "parser.h"
#include "tokenlist.h"
#include <limits.h>

53
src/parser.c Normal file
View File

@ -0,0 +1,53 @@
#include "parser.h"
#include "ast.h"
#include "lexer.h"
#include "parser_combinators.h"
#include "parser_primitives.h"
#include "parser_util.h"
#include "tokenlist.h"
// Parses a number literal starting at `current`. The octal, decimal,
// hexadecimal and binary primitive parsers are handed to parse_any in that
// order and its result is returned unchanged.
parse_result_t parse_number(tokenlist_entry_t *current) {
    parser_t number_formats[] = {
        parse_octal,
        parse_decimal,
        parse_hexadecimal,
        parse_binary,
        nullptr, // terminator
    };
    return parse_any(current, number_formats);
}
// Parses a single instruction operand starting at `current` by handing the
// register and number parsers, in that order, to parse_any.
parse_result_t parse_operand(tokenlist_entry_t *current) {
    // FIXME: not the correct set of parsers
    parser_t operand_kinds[] = {
        parse_register,
        parse_number,
        nullptr, // terminator
    };
    return parse_any(current, operand_kinds);
}
// Parses a TOKEN_COMMA delimited list of operands into a NODE_OPERANDS node
// via parse_list. The `true` flag is forwarded as-is; its meaning is defined
// by parse_list.
parse_result_t parse_operands(tokenlist_entry_t *current) {
    parse_result_t operands =
        parse_list(current, NODE_OPERANDS, true, TOKEN_COMMA, parse_operand);
    return operands;
}
// Parses a label: parse_identifier followed by parse_colon, combined by
// parse_consecutive under a NODE_LABEL parent.
parse_result_t parse_label(tokenlist_entry_t *current) {
    parser_t label_sequence[] = {
        parse_identifier,
        parse_colon,
        nullptr, // terminator
    };
    return parse_consecutive(current, NODE_LABEL, label_sequence);
}
// Parses the body of a section directive: parse_section followed by
// parse_identifier, combined by parse_consecutive under a
// NODE_SECTION_DIRECTIVE parent.
parse_result_t parse_section_directive(tokenlist_entry_t *current) {
    parser_t section_sequence[] = {
        parse_section,
        parse_identifier,
        nullptr, // terminator
    };
    return parse_consecutive(current, NODE_SECTION_DIRECTIVE, section_sequence);
}
// Parses a directive: parse_dot followed by parse_section_directive,
// combined by parse_consecutive under a NODE_DIRECTIVE parent.
parse_result_t parse_directive(tokenlist_entry_t *current) {
    parser_t directive_sequence[] = {
        parse_dot,
        parse_section_directive,
        nullptr, // terminator
    };
    return parse_consecutive(current, NODE_DIRECTIVE, directive_sequence);
}
// Parses an instruction: parse_identifier followed by parse_operands,
// combined by parse_consecutive under a NODE_INSTRUCTION parent.
parse_result_t parse_instruction(tokenlist_entry_t *current) {
    parser_t instruction_sequence[] = {
        parse_identifier,
        parse_operands,
        nullptr, // terminator
    };
    return parse_consecutive(current, NODE_INSTRUCTION, instruction_sequence);
}
// Parses one statement by handing the label, directive and instruction
// parsers, in that order, to parse_any.
parse_result_t parse_statement(tokenlist_entry_t *current) {
    parser_t statement_kinds[] = {
        parse_label,
        parse_directive,
        parse_instruction,
        nullptr, // terminator
    };
    return parse_any(current, statement_kinds);
}
// Parser entry point: applies parse_statement through parse_many, producing a
// NODE_PROGRAM result. The `true` flag is forwarded as-is; its meaning is
// defined by parse_many.
parse_result_t parse(tokenlist_entry_t *current) {
    parse_result_t program =
        parse_many(current, NODE_PROGRAM, true, parse_statement);
    return program;
}

11
src/parser.h Normal file
View File

@ -0,0 +1,11 @@
#ifndef INCLUDE_SRC_PARSER_H_
#define INCLUDE_SRC_PARSER_H_
#include "ast.h"
#include "error.h"
#include "parser_util.h"
#include "tokenlist.h"
parse_result_t parse(tokenlist_entry_t *current);
#endif // INCLUDE_SRC_PARSER_H_

View File

@ -1,140 +0,0 @@
#include "parser.h"
#include "../ast.h"
#include "../lexer.h"
#include "../tokenlist.h"
#include "combinators.h"
#include "primitives.h"
#include "util.h"
// Parses a number literal starting at `current`. The octal, decimal,
// hexadecimal and binary primitive parsers are handed to parse_any in that
// order, and the outcome is passed through parse_result_wrap with
// NODE_NUMBER.
parse_result_t parse_number(tokenlist_entry_t *current) {
    parser_t number_formats[] = {
        parse_octal,
        parse_decimal,
        parse_hexadecimal,
        parse_binary,
        nullptr, // terminator
    };
    return parse_result_wrap(NODE_NUMBER, parse_any(current, number_formats));
}
// Parses a sign by handing parse_plus and parse_minus, in that order, to
// parse_any.
parse_result_t parse_plus_or_minus(tokenlist_entry_t *current) {
    parser_t signs[] = {
        parse_plus,
        parse_minus,
        nullptr, // terminator
    };
    return parse_any(current, signs);
}
// Parses a register index: parse_plus, parse_register, parse_asterisk and
// parse_number in sequence, combined by parse_consecutive under a
// NODE_REGISTER_INDEX parent.
parse_result_t parse_register_index(tokenlist_entry_t *current) {
    parser_t index_sequence[] = {
        parse_plus,
        parse_register,
        parse_asterisk,
        parse_number,
        nullptr, // terminator
    };
    return parse_consecutive(current, NODE_REGISTER_INDEX, index_sequence);
}
// Parses a register offset: parse_plus_or_minus followed by parse_number,
// combined by parse_consecutive under a NODE_REGISTER_OFFSET parent.
parse_result_t parse_register_offset(tokenlist_entry_t *current) {
    parser_t offset_sequence[] = {
        parse_plus_or_minus,
        parse_number,
        nullptr, // terminator
    };
    return parse_consecutive(current, NODE_REGISTER_OFFSET, offset_sequence);
}
// Parses <register> <register_index>? <register_offset>? into a
// NODE_REGISTER_EXPRESSION node.
//
// The leading register is mandatory: if it fails, the partially built node is
// freed and the register parser's result is returned as-is. The index and
// offset are optional: when one fails, its error is freed and parsing
// continues from the same token. A failure to allocate the node or to attach
// a child frees the nodes involved and returns that error.
parse_result_t parse_register_expression(tokenlist_entry_t *current) {
    ast_node_t *expression;
    error_t *err = ast_node_alloc(&expression);
    if (err)
        return parse_error(err);
    expression->id = NODE_REGISTER_EXPRESSION;

    // <register>
    parse_result_t part = parse_register(current);
    if (part.err) {
        ast_node_free(expression);
        return part;
    }
    err = ast_node_add_child(expression, part.node);
    if (err) {
        ast_node_free(part.node);
        ast_node_free(expression);
        return parse_error(err);
    }
    current = part.next;

    // <register_index>? followed by <register_offset>?
    parser_t optional_parts[] = {
        parse_register_index,
        parse_register_offset,
        nullptr, // terminator
    };
    for (parser_t *optional = optional_parts; *optional != nullptr;
         ++optional) {
        part = (*optional)(current);
        if (part.err) {
            // an absent optional part is not a failure of the expression
            error_free(part.err);
            continue;
        }
        err = ast_node_add_child(expression, part.node);
        if (err) {
            ast_node_free(part.node);
            ast_node_free(expression);
            return parse_error(err);
        }
        current = part.next;
    }

    return parse_success(expression, current);
}
// Parses an immediate operand: parse_number and parse_identifier are handed
// to parse_any in that order, and the outcome is passed through
// parse_result_wrap with NODE_IMMEDIATE.
parse_result_t parse_immediate(tokenlist_entry_t *current) {
    parser_t immediate_kinds[] = {
        parse_number,
        parse_identifier,
        nullptr, // terminator
    };
    return parse_result_wrap(NODE_IMMEDIATE,
                             parse_any(current, immediate_kinds));
}
// Parses the contents of a memory operand by handing
// parse_register_expression and parse_identifier, in that order, to
// parse_any.
parse_result_t parse_memory_expression(tokenlist_entry_t *current) {
    parser_t expression_kinds[] = {
        parse_register_expression,
        parse_identifier,
        nullptr, // terminator
    };
    return parse_any(current, expression_kinds);
}
// Parses a memory operand: parse_lbracket, parse_memory_expression and
// parse_rbracket in sequence, combined by parse_consecutive under a
// NODE_MEMORY parent.
parse_result_t parse_memory(tokenlist_entry_t *current) {
    parser_t memory_sequence[] = {
        parse_lbracket,
        parse_memory_expression,
        parse_rbracket,
        nullptr, // terminator
    };
    return parse_consecutive(current, NODE_MEMORY, memory_sequence);
}
// Parses a single instruction operand by handing the register, memory and
// immediate parsers, in that order, to parse_any.
parse_result_t parse_operand(tokenlist_entry_t *current) {
    parser_t operand_kinds[] = {
        parse_register,
        parse_memory,
        parse_immediate,
        nullptr, // terminator
    };
    return parse_any(current, operand_kinds);
}
// Parses a TOKEN_COMMA delimited list of operands into a NODE_OPERANDS node
// via parse_list. The `true` flag is forwarded as-is; its meaning is defined
// by parse_list.
parse_result_t parse_operands(tokenlist_entry_t *current) {
    parse_result_t operands =
        parse_list(current, NODE_OPERANDS, true, TOKEN_COMMA, parse_operand);
    return operands;
}
// Parses a label: parse_identifier followed by parse_colon, combined by
// parse_consecutive under a NODE_LABEL parent.
parse_result_t parse_label(tokenlist_entry_t *current) {
    parser_t label_sequence[] = {
        parse_identifier,
        parse_colon,
        nullptr, // terminator
    };
    return parse_consecutive(current, NODE_LABEL, label_sequence);
}
// Parses the body of a section directive: parse_section followed by
// parse_identifier, combined by parse_consecutive under a
// NODE_SECTION_DIRECTIVE parent.
parse_result_t parse_section_directive(tokenlist_entry_t *current) {
    parser_t section_sequence[] = {
        parse_section,
        parse_identifier,
        nullptr, // terminator
    };
    return parse_consecutive(current, NODE_SECTION_DIRECTIVE, section_sequence);
}
// Parses a directive: parse_dot followed by parse_section_directive,
// combined by parse_consecutive under a NODE_DIRECTIVE parent.
parse_result_t parse_directive(tokenlist_entry_t *current) {
    parser_t directive_sequence[] = {
        parse_dot,
        parse_section_directive,
        nullptr, // terminator
    };
    return parse_consecutive(current, NODE_DIRECTIVE, directive_sequence);
}
// Parses an instruction: parse_identifier followed by parse_operands,
// combined by parse_consecutive under a NODE_INSTRUCTION parent.
parse_result_t parse_instruction(tokenlist_entry_t *current) {
    parser_t instruction_sequence[] = {
        parse_identifier,
        parse_operands,
        nullptr, // terminator
    };
    return parse_consecutive(current, NODE_INSTRUCTION, instruction_sequence);
}
// Parses one statement by handing the label, directive and instruction
// parsers, in that order, to parse_any.
parse_result_t parse_statement(tokenlist_entry_t *current) {
    parser_t statement_kinds[] = {
        parse_label,
        parse_directive,
        parse_instruction,
        nullptr, // terminator
    };
    return parse_any(current, statement_kinds);
}
// Parser entry point: applies parse_statement through parse_many, producing a
// NODE_PROGRAM result. The `true` flag is forwarded as-is; its meaning is
// defined by parse_many.
parse_result_t parse(tokenlist_entry_t *current) {
    parse_result_t program =
        parse_many(current, NODE_PROGRAM, true, parse_statement);
    return program;
}

View File

@ -1,9 +0,0 @@
#ifndef INCLUDE_PARSER_PARSER_H_
#define INCLUDE_PARSER_PARSER_H_
#include "../tokenlist.h"
#include "util.h"
parse_result_t parse(tokenlist_entry_t *current);
#endif // INCLUDE_PARSER_PARSER_H_

View File

@ -1,4 +1,4 @@
#include "combinators.h"
#include "parser_combinators.h"
// Parse a list of the given parser delimited by the given token id. Does not
// store the delimiters in the parent node

View File

@ -1,7 +1,4 @@
#ifndef INCLUDE_PARSER_COMBINATORS_H_
#define INCLUDE_PARSER_COMBINATORS_H_
#include "util.h"
#include "parser_util.h"
typedef parse_result_t (*parser_t)(tokenlist_entry_t *);
@ -21,5 +18,3 @@ parse_result_t parse_list(tokenlist_entry_t *current, node_id_t id,
// wraps the parsed nodes in a new parent node.
parse_result_t parse_consecutive(tokenlist_entry_t *current, node_id_t id,
parser_t parsers[]);
#endif // INCLUDE_PARSER_COMBINATORS_H_

View File

@ -1,5 +1,5 @@
#include "primitives.h"
#include "../ast.h"
#include "parser_primitives.h"
#include "ast.h"
#include <string.h>
parse_result_t parse_identifier(tokenlist_entry_t *current) {
@ -62,11 +62,6 @@ parse_result_t parse_dot(tokenlist_entry_t *current) {
return parse_token(current, TOKEN_DOT, NODE_DOT, nullptr);
}
// Primitive parser for a label reference: matches a TOKEN_IDENTIFIER token
// and tags the resulting node NODE_LABEL_REFERENCE. No validator is supplied.
parse_result_t parse_label_reference(tokenlist_entry_t *current) {
    parse_result_t reference =
        parse_token(current, TOKEN_IDENTIFIER, NODE_LABEL_REFERENCE, nullptr);
    return reference;
}
const char *registers[] = {
// 64-bit registers
"rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi", "r8", "r9", "r10",
@ -80,7 +75,6 @@ const char *registers[] = {
// 8-bit low registers
"al", "cl", "dl", "bl", "spl", "bpl", "sil", "dil", "r8b", "r9b", "r10b",
"r11b", "r12b", "r13b", "r14b", "r15b", nullptr};
bool is_register_token(lexer_token_t *token) {
for (size_t i = 0; registers[i] != nullptr; ++i)
if (strcmp(token->value, registers[i]) == 0)

View File

@ -1,7 +1,7 @@
#ifndef INCLUDE_PARSER_PRIMITIVES_H_
#define INCLUDE_PARSER_PRIMITIVES_H_
#ifndef INCLUDE_SRC_PARSER_PRIMITIVES_H_
#define INCLUDE_SRC_PARSER_PRIMITIVES_H_
#include "util.h"
#include "parser_util.h"
parse_result_t parse_identifier(tokenlist_entry_t *current);
parse_result_t parse_decimal(tokenlist_entry_t *current);
@ -18,7 +18,6 @@ parse_result_t parse_plus(tokenlist_entry_t *current);
parse_result_t parse_minus(tokenlist_entry_t *current);
parse_result_t parse_asterisk(tokenlist_entry_t *current);
parse_result_t parse_dot(tokenlist_entry_t *current);
parse_result_t parse_label_reference(tokenlist_entry_t *current);
/* These are "primitives" with a different name and some extra validation on top
* for example, register is just an identifier but it only matches a limited set
@ -27,4 +26,4 @@ parse_result_t parse_label_reference(tokenlist_entry_t *current);
parse_result_t parse_register(tokenlist_entry_t *current);
parse_result_t parse_section(tokenlist_entry_t *current);
#endif // INCLUDE_PARSER_PRIMITIVES_H_
#endif // INCLUDE_SRC_PARSER_PRIMITIVES_H_

View File

@ -1,5 +1,5 @@
#include "util.h"
#include "../tokenlist.h"
#include "parser_util.h"
#include "tokenlist.h"
// Shared error value reporting that parsing failed to find the correct token
// sequence. It points at a file-scope compound literal, so the pointed-to
// object has static storage duration and is never heap-allocated.
// NOTE(review): confirm error_free tolerates being handed this pointer.
error_t *err_parse_no_match =
&(error_t){.message = "parsing failed to find the correct token sequence"};
@ -33,24 +33,3 @@ parse_result_t parse_token(tokenlist_entry_t *current,
return parse_success(node, current->next);
}
// Wraps a successful parse result in a newly allocated parent node.
//
// id:     node id assigned to the new parent node
// result: the parse result to wrap
//
// A failed result (result.err set) is returned untouched. Otherwise the
// returned result carries a new node with the given id that has result.node
// added as a child, and keeps result.next as the continuation point. If
// allocating the parent or attaching the child fails, every node involved is
// freed and the error is returned through parse_error.
parse_result_t parse_result_wrap(node_id_t id, parse_result_t result) {
    if (result.err)
        return result;

    ast_node_t *wrapper;
    error_t *err = ast_node_alloc(&wrapper);
    if (err) {
        ast_node_free(result.node);
        return parse_error(err);
    }
    wrapper->id = id;

    err = ast_node_add_child(wrapper, result.node);
    if (err) {
        // The child was not attached, so both nodes must be released
        // individually; previously the wrapper was leaked on this path.
        ast_node_free(result.node);
        ast_node_free(wrapper);
        return parse_error(err);
    }

    return parse_success(wrapper, result.next);
}

View File

@ -1,9 +1,9 @@
#ifndef INCLUDE_PARSER_UTIL_H_
#define INCLUDE_PARSER_UTIL_H_
#ifndef INCLUDE_SRC_PARSER_UTIL_H_
#define INCLUDE_SRC_PARSER_UTIL_H_
#include "../ast.h"
#include "../error.h"
#include "../tokenlist.h"
#include "ast.h"
#include "error.h"
#include "tokenlist.h"
typedef struct parse_result {
error_t *err;
@ -19,8 +19,9 @@ parse_result_t parse_success(ast_node_t *ast, tokenlist_entry_t *next);
parse_result_t parse_token(tokenlist_entry_t *current,
lexer_token_id_t token_id, node_id_t ast_id,
token_validator_t is_valid);
parse_result_t parse_result_wrap(node_id_t id, parse_result_t result);
tokenlist_entry_t *skip_insignificant(tokenlist_entry_t *);
extern error_t *err_parse_no_match;
#endif // INCLUDE_PARSER_UTIL_H_
#endif // INCLUDE_SRC_PARSER_UTIL_H_

View File

@ -1,17 +1,8 @@
.section text
; Small valid code snippet that should contain all different AST nodes
_start:
mov eax, ebx
lea eax, [eax + ebx * 4 + 8]
lea eax, [eax + 8]
lea eax, [eax + ebx * 8]
lea eax, [esp - 24]
lea eax, [eax + ebx * 4 - 8]
lea eax, [_start]
mov eax, _start
mov eax, 555
mov eax, 555 ; move 555 into eax
push 0o777
xor eax, 0xDEADBEEF
and ecx, 0o770