Compare commits

...

4 Commits

Author SHA1 Message Date
00272d69bf Add regression test for parse zero operands at eof
All checks were successful
Validate the build / validate-build (push) Successful in 30s
2025-04-16 13:16:55 +02:00
2385d38608 Prune the parse tree of NODE_NEWLINE after parsing succeeds 2025-04-16 13:01:02 +02:00
242fd9baa5 Fix grammar not being able to disambiguate some instructions
When two identifiers follow eachother it could be two instruction
mnemonics or one instruction mnemonic and one operand. To fix this
TOKEN_NEWLINE has been reintroduced as a semantic token. The grammar has
been changed to allow empty statements and every instruction and
directive has to end in a newline. Labels do not have to end in a
newline.

In addition to updating the grammar, the implementation of tokenlist,
ast and parser has been updated to reflect these changes.
2025-04-16 12:34:44 +02:00
1574ec6249 Fix parse_consecutive behavior when the token stream runs out 2025-04-16 12:13:02 +02:00
10 changed files with 103 additions and 11 deletions

View File

@ -1,13 +1,13 @@
<program> ::= <statement>*
<statement> ::= <label> | <directive> | <instruction>
<statement> ::= <label> | <directive> | <instruction> | <newline>
<label> ::= <identifier> <colon>
<directive> ::= <dot> <section_directive>
<directive> ::= <dot> <section_directive> <newline>
<section_directive> ::= "section" <identifier>
<instruction> ::= <identifier> <operands>
<instruction> ::= <identifier> <operands> <newline>
<operands> ::= <operand> ( <comma> <operand> )*

View File

@ -157,6 +157,8 @@ const char *ast_node_id_to_cstr(node_id_t id) {
return "NODE_ASTERISK";
case NODE_DOT:
return "NODE_DOT";
case NODE_NEWLINE:
return "NODE_NEWLINE";
}
assert(!"Unreachable, weird node id" && id);
__builtin_unreachable();
@ -172,7 +174,8 @@ static void ast_node_print_internal(ast_node_t *node, int indent) {
}
printf("%s", ast_node_id_to_cstr(node->id));
if (node->token_entry && node->token_entry->token.value) {
if (node->token_entry && node->token_entry->token.value &&
node->id != NODE_NEWLINE) {
printf(" \"%s\"", node->token_entry->token.value);
}
printf("\n");
@ -185,3 +188,18 @@ static void ast_node_print_internal(ast_node_t *node, int indent) {
void ast_node_print(ast_node_t *node) {
ast_node_print_internal(node, 0);
}
void ast_node_prune(ast_node_t *node, node_id_t id) {
size_t new_len = 0;
for (size_t i = 0; i < node->len; i++) {
auto child = node->children[i];
if (child->id == id) {
ast_node_free(child);
continue;
}
ast_node_prune(child, id);
node->children[new_len] = child;
new_len++;
}
node->len = new_len;
}

View File

@ -50,6 +50,7 @@ typedef enum node_id {
NODE_MINUS,
NODE_ASTERISK,
NODE_DOT,
NODE_NEWLINE,
} node_id_t;
typedef struct ast_node ast_node_t;
@ -119,4 +120,17 @@ error_t *ast_node_add_child(ast_node_t *node, ast_node_t *child);
*/
void ast_node_print(ast_node_t *node);
/**
* Prune the children with a given id
*
* The tree is recursively visited and all child nodes of a given ID are pruned
* completely. If a node has the giver id, it will get removed along wih all its
* children, even if some of those children have different ids. The root node id
* is never checked so the tree is guaranteed to remain and allocated valid.
*
* @param node The root of the tree you want to prune
* @param id The id of the nodes you want to prune
*/
void ast_node_prune(ast_node_t *node, node_id_t id);
#endif // INCLUDE_SRC_AST_H_

View File

@ -1,4 +1,5 @@
#include "combinators.h"
#include "util.h"
// Parse a list of the given parser delimited by the given token id. Does not
// store the delimiters in the parent node
@ -122,5 +123,12 @@ parse_result_t parse_consecutive(tokenlist_entry_t *current, node_id_t id,
}
current = result.next;
}
// token stream ended before we matched all parsers
if (parser != nullptr) {
ast_node_free(all);
return parse_no_match();
}
return parse_success(all, current);
}

View File

@ -120,22 +120,28 @@ parse_result_t parse_section_directive(tokenlist_entry_t *current) {
}
parse_result_t parse_directive(tokenlist_entry_t *current) {
parser_t parsers[] = {parse_dot, parse_section_directive, nullptr};
parser_t parsers[] = {parse_dot, parse_section_directive, parse_newline,
nullptr};
return parse_consecutive(current, NODE_DIRECTIVE, parsers);
}
parse_result_t parse_instruction(tokenlist_entry_t *current) {
parser_t parsers[] = {parse_identifier, parse_operands, nullptr};
parser_t parsers[] = {parse_identifier, parse_operands, parse_newline,
nullptr};
return parse_consecutive(current, NODE_INSTRUCTION, parsers);
}
parse_result_t parse_statement(tokenlist_entry_t *current) {
parser_t parsers[] = {parse_label, parse_directive, parse_instruction,
nullptr};
parse_newline, nullptr};
return parse_any(current, parsers);
}
parse_result_t parse(tokenlist_entry_t *current) {
current = tokenlist_skip_trivia(current);
return parse_many(current, NODE_PROGRAM, true, parse_statement);
parse_result_t result =
parse_many(current, NODE_PROGRAM, true, parse_statement);
if (result.node != nullptr)
ast_node_prune(result.node, NODE_NEWLINE);
return result;
}

View File

@ -62,6 +62,10 @@ parse_result_t parse_dot(tokenlist_entry_t *current) {
return parse_token(current, TOKEN_DOT, NODE_DOT, nullptr);
}
parse_result_t parse_newline(tokenlist_entry_t *current) {
return parse_token(current, TOKEN_NEWLINE, NODE_NEWLINE, nullptr);
}
parse_result_t parse_label_reference(tokenlist_entry_t *current) {
return parse_token(current, TOKEN_IDENTIFIER, NODE_LABEL_REFERENCE,
nullptr);

View File

@ -18,6 +18,7 @@ parse_result_t parse_plus(tokenlist_entry_t *current);
parse_result_t parse_minus(tokenlist_entry_t *current);
parse_result_t parse_asterisk(tokenlist_entry_t *current);
parse_result_t parse_dot(tokenlist_entry_t *current);
parse_result_t parse_newline(tokenlist_entry_t *current);
parse_result_t parse_label_reference(tokenlist_entry_t *current);
/* These are "primitives" with a different name and some extra validation on top

View File

@ -86,7 +86,6 @@ bool is_trivia(tokenlist_entry_t *trivia) {
switch (trivia->token.id) {
case TOKEN_WHITESPACE:
case TOKEN_COMMENT:
case TOKEN_NEWLINE:
return true;
default:
return false;

View File

@ -0,0 +1,5 @@
; regression test for two issues:
; - parsing two zero operand instructions in a row
; - a zero operand instruction just before eof
syscall
ret

View File

@ -23,9 +23,46 @@ MunitResult test_regression_trivia_head(const MunitParameter params[], void *dat
ast_node_free(result.node);
tokenlist_free(list);
return MUNIT_OK;
}
MunitResult test_no_operands_eof(const MunitParameter params[], void *data) {
(void)params;
(void)data;
lexer_t *lex = &(lexer_t){};
error_t *err = lexer_open(lex, "tests/input/regression/test_no_operands_eof.asm");
munit_assert_null(err);
tokenlist_t *list;
err = tokenlist_alloc(&list);
munit_assert_null(err);
err = tokenlist_fill(list, lex);
munit_assert_null(err);
parse_result_t result = parse(list->head);
munit_assert_null(result.err);
munit_assert_null(result.next);
// Both children should be instructions
munit_assert_size(result.node->len, ==, 2);
munit_assert_int(result.node->children[0]->id, ==, NODE_INSTRUCTION);
munit_assert_int(result.node->children[1]->id, ==, NODE_INSTRUCTION);
// And they should have empty operands
munit_assert_size(result.node->children[0]->len, ==, 2);
munit_assert_size(result.node->children[1]->len, ==, 2);
munit_assert_size(result.node->children[0]->children[1]->len, ==, 0);
munit_assert_size(result.node->children[1]->children[1]->len, ==, 0);
ast_node_free(result.node);
tokenlist_free(list);
return MUNIT_OK;
}
MunitTest regression_tests[] = {
{"/trivia_head", test_regression_trivia_head, nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr},
{nullptr, nullptr, nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr}
{"/trivia_head", test_regression_trivia_head, nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr},
{"/no_operands_eof", test_no_operands_eof, nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr},
{nullptr, nullptr, nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr}
};