Add regression test for parse zero operands at eof

Prune the parse tree of NODE_NEWLINE after parsing succeeds
Fix grammar not being able to disambiguate some instructions
2025-04-16 13:16:55 +02:00 · 2025-04-16 13:01:02 +02:00 · 2025-04-16 12:34:44 +02:00 · 2025-04-16 12:13:02 +02:00
10 changed files with 103 additions and 11 deletions
--- a/doc/parser_grammar.txt
+++ b/doc/parser_grammar.txt
@ -1,13 +1,13 @@
 <program>   ::= <statement>*
-<statement> ::= <label> | <directive> | <instruction>
+<statement> ::= <label> | <directive> | <instruction> | <newline>

 <label> ::= <identifier> <colon>

-<directive> ::= <dot> <section_directive>
+<directive> ::= <dot> <section_directive> <newline>

 <section_directive> ::= "section" <identifier>

-<instruction> ::= <identifier> <operands>
+<instruction> ::= <identifier> <operands> <newline>

 <operands> ::= <operand> ( <comma> <operand> )*

--- a/src/ast.c
+++ b/src/ast.c
@ -157,6 +157,8 @@ const char *ast_node_id_to_cstr(node_id_t id) {
        return "NODE_ASTERISK";
    case NODE_DOT:
        return "NODE_DOT";
+    case NODE_NEWLINE:
+        return "NODE_NEWLINE";
    }
    assert(!"Unreachable, weird node id" && id);
    __builtin_unreachable();
@ -172,7 +174,8 @@ static void ast_node_print_internal(ast_node_t *node, int indent) {
    }
    printf("%s", ast_node_id_to_cstr(node->id));

-    if (node->token_entry && node->token_entry->token.value) {
+    if (node->token_entry && node->token_entry->token.value &&
+        node->id != NODE_NEWLINE) {
        printf(" \"%s\"", node->token_entry->token.value);
    }
    printf("\n");
@ -185,3 +188,18 @@ static void ast_node_print_internal(ast_node_t *node, int indent) {
 void ast_node_print(ast_node_t *node) {
    ast_node_print_internal(node, 0);
 }
+
+void ast_node_prune(ast_node_t *node, node_id_t id) {
+    size_t new_len = 0;
+    for (size_t i = 0; i < node->len; i++) {
+        auto child = node->children[i];
+        if (child->id == id) {
+            ast_node_free(child);
+            continue;
+        }
+        ast_node_prune(child, id);
+        node->children[new_len] = child;
+        new_len++;
+    }
+    node->len = new_len;
+}
--- a/src/ast.h
+++ b/src/ast.h
@ -50,6 +50,7 @@ typedef enum node_id {
    NODE_MINUS,
    NODE_ASTERISK,
    NODE_DOT,
+    NODE_NEWLINE,
 } node_id_t;

 typedef struct ast_node ast_node_t;
@ -119,4 +120,17 @@ error_t *ast_node_add_child(ast_node_t *node, ast_node_t *child);
 */
 void ast_node_print(ast_node_t *node);

+/**
+ * Prune the children with a given id
+ *
+ * The tree is recursively visited and all child nodes of a given ID are pruned
+ * completely. If a node has the given id, it is removed along with all its
+ * children, even if some of those children have different ids. The root node id
+ * is never checked so the tree is guaranteed to remain allocated and valid.
+ *
+ * @param node The root of the tree you want to prune
+ * @param id The id of the nodes you want to prune
+ */
+void ast_node_prune(ast_node_t *node, node_id_t id);
+
 #endif // INCLUDE_SRC_AST_H_
--- a/src/parser/combinators.c
+++ b/src/parser/combinators.c
@ -1,4 +1,5 @@
 #include "combinators.h"
+#include "util.h"

 // Parse a list of the given parser delimited by the given token id. Does not
 // store the delimiters in the parent node
@ -122,5 +123,12 @@ parse_result_t parse_consecutive(tokenlist_entry_t *current, node_id_t id,
        }
        current = result.next;
    }
+
+    // token stream ended before we matched all parsers
+    if (parser != nullptr) {
+        ast_node_free(all);
+        return parse_no_match();
+    }
+
    return parse_success(all, current);
 }
--- a/src/parser/parser.c
+++ b/src/parser/parser.c
@ -120,22 +120,28 @@ parse_result_t parse_section_directive(tokenlist_entry_t *current) {
 }

 parse_result_t parse_directive(tokenlist_entry_t *current) {
-    parser_t parsers[] = {parse_dot, parse_section_directive, nullptr};
+    parser_t parsers[] = {parse_dot, parse_section_directive, parse_newline,
+                          nullptr};
    return parse_consecutive(current, NODE_DIRECTIVE, parsers);
 }

 parse_result_t parse_instruction(tokenlist_entry_t *current) {
-    parser_t parsers[] = {parse_identifier, parse_operands, nullptr};
+    parser_t parsers[] = {parse_identifier, parse_operands, parse_newline,
+                          nullptr};
    return parse_consecutive(current, NODE_INSTRUCTION, parsers);
 }

 parse_result_t parse_statement(tokenlist_entry_t *current) {
    parser_t parsers[] = {parse_label, parse_directive, parse_instruction,
-                          nullptr};
+                          parse_newline, nullptr};
    return parse_any(current, parsers);
 }

 parse_result_t parse(tokenlist_entry_t *current) {
    current = tokenlist_skip_trivia(current);
-    return parse_many(current, NODE_PROGRAM, true, parse_statement);
+    parse_result_t result =
+        parse_many(current, NODE_PROGRAM, true, parse_statement);
+    if (result.node != nullptr)
+        ast_node_prune(result.node, NODE_NEWLINE);
+    return result;
 }
--- a/src/parser/primitives.c
+++ b/src/parser/primitives.c
@ -62,6 +62,10 @@ parse_result_t parse_dot(tokenlist_entry_t *current) {
    return parse_token(current, TOKEN_DOT, NODE_DOT, nullptr);
 }

+parse_result_t parse_newline(tokenlist_entry_t *current) {
+    return parse_token(current, TOKEN_NEWLINE, NODE_NEWLINE, nullptr);
+}
+
 parse_result_t parse_label_reference(tokenlist_entry_t *current) {
    return parse_token(current, TOKEN_IDENTIFIER, NODE_LABEL_REFERENCE,
                       nullptr);
--- a/src/parser/primitives.h
+++ b/src/parser/primitives.h
@ -18,6 +18,7 @@ parse_result_t parse_plus(tokenlist_entry_t *current);
 parse_result_t parse_minus(tokenlist_entry_t *current);
 parse_result_t parse_asterisk(tokenlist_entry_t *current);
 parse_result_t parse_dot(tokenlist_entry_t *current);
+parse_result_t parse_newline(tokenlist_entry_t *current);
 parse_result_t parse_label_reference(tokenlist_entry_t *current);

 /* These are "primitives" with a different name and some extra validation on top
--- a/src/tokenlist.c
+++ b/src/tokenlist.c
@ -86,7 +86,6 @@ bool is_trivia(tokenlist_entry_t *trivia) {
    switch (trivia->token.id) {
    case TOKEN_WHITESPACE:
    case TOKEN_COMMENT:
-    case TOKEN_NEWLINE:
        return true;
    default:
        return false;
--- a/tests/input/regression/test_no_operands_eof.asm
+++ b/tests/input/regression/test_no_operands_eof.asm
@ -0,0 +1,5 @@
+; regression test for two issues:
+;  - parsing two zero operand instructions in a row
+;  - a zero operand instruction just before eof
+    syscall
+    ret
--- a/tests/regression.c
+++ b/tests/regression.c
@ -23,9 +23,46 @@ MunitResult test_regression_trivia_head(const MunitParameter params[], void *dat

    ast_node_free(result.node);
    tokenlist_free(list);
+    return MUNIT_OK;
+}
+
+MunitResult test_no_operands_eof(const MunitParameter params[], void *data) {
+    (void)params;
+    (void)data;
+
+    lexer_t *lex = &(lexer_t){};
+    error_t *err = lexer_open(lex, "tests/input/regression/test_no_operands_eof.asm");
+    munit_assert_null(err);
+
+    tokenlist_t *list;
+    err = tokenlist_alloc(&list);
+    munit_assert_null(err);
+
+    err = tokenlist_fill(list, lex);
+    munit_assert_null(err);
+
+    parse_result_t result = parse(list->head);
+    munit_assert_null(result.err);
+    munit_assert_null(result.next);
+
+    // Both children should be instructions
+    munit_assert_size(result.node->len, ==, 2);
+    munit_assert_int(result.node->children[0]->id, ==, NODE_INSTRUCTION);
+    munit_assert_int(result.node->children[1]->id, ==, NODE_INSTRUCTION);
+
+    // And they should have empty operands
+    munit_assert_size(result.node->children[0]->len, ==, 2);
+    munit_assert_size(result.node->children[1]->len, ==, 2);
+    munit_assert_size(result.node->children[0]->children[1]->len, ==, 0);
+    munit_assert_size(result.node->children[1]->children[1]->len, ==, 0);
+
+    ast_node_free(result.node);
+    tokenlist_free(list);
+    return MUNIT_OK;
 }

 MunitTest regression_tests[] = {
-    {"/trivia_head", test_regression_trivia_head, nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr},
-    {nullptr,        nullptr,                     nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr}
+    {"/trivia_head",     test_regression_trivia_head, nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr},
+    {"/no_operands_eof", test_no_operands_eof,        nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr},
+    {nullptr,            nullptr,                     nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr}
 };
Author	SHA1	Message	Date
omicron	00272d69bf	Add regression test for parse zero operands at eof All checks were successful Validate the build / validate-build (push) Successful in 30s Details	2025-04-16 13:16:55 +02:00
omicron	2385d38608	Prune the parse tree of NODE_NEWLINE after parsing succeeds	2025-04-16 13:01:02 +02:00
omicron	242fd9baa5	Fix grammar not being able to disambiguate some instructions When two identifiers follow each other it could be two instruction mnemonics or one instruction mnemonic and one operand. To fix this TOKEN_NEWLINE has been reintroduced as a semantic token. The grammar has been changed to allow empty statements and every instruction and directive has to end in a newline. Labels do not have to end in a newline. In addition to updating the grammar, the implementation of tokenlist, ast and parser has been updated to reflect these changes.	2025-04-16 12:34:44 +02:00
omicron	1574ec6249	Fix parse_consecutive behavior when the token stream runs out	2025-04-16 12:13:02 +02:00