Fix zero operand parser bugs #17
@@ -1,13 +1,13 @@
 <program> ::= <statement>*
-<statement> ::= <label> | <directive> | <instruction>
+<statement> ::= <label> | <directive> | <instruction> | <newline>
 
 <label> ::= <identifier> <colon>
 
-<directive> ::= <dot> <section_directive>
+<directive> ::= <dot> <section_directive> <newline>
 
 <section_directive> ::= "section" <identifier>
 
-<instruction> ::= <identifier> <operands>
+<instruction> ::= <identifier> <operands> <newline>
 
 <operands> ::= <operand> ( <comma> <operand> )*
 
src/ast.c (20 changed lines)
@@ -157,6 +157,8 @@ const char *ast_node_id_to_cstr(node_id_t id) {
         return "NODE_ASTERISK";
     case NODE_DOT:
         return "NODE_DOT";
+    case NODE_NEWLINE:
+        return "NODE_NEWLINE";
     }
     assert(!"Unreachable, weird node id" && id);
     __builtin_unreachable();
@@ -172,7 +174,8 @@ static void ast_node_print_internal(ast_node_t *node, int indent) {
     }
     printf("%s", ast_node_id_to_cstr(node->id));
 
-    if (node->token_entry && node->token_entry->token.value) {
+    if (node->token_entry && node->token_entry->token.value &&
+        node->id != NODE_NEWLINE) {
         printf(" \"%s\"", node->token_entry->token.value);
     }
     printf("\n");
@@ -185,3 +188,18 @@ static void ast_node_print_internal(ast_node_t *node, int indent) {
 void ast_node_print(ast_node_t *node) {
     ast_node_print_internal(node, 0);
 }
+
+void ast_node_prune(ast_node_t *node, node_id_t id) {
+    size_t new_len = 0;
+    for (size_t i = 0; i < node->len; i++) {
+        auto child = node->children[i];
+        if (child->id == id) {
+            ast_node_free(child);
+            continue;
+        }
+        ast_node_prune(child, id);
+        node->children[new_len] = child;
+        new_len++;
+    }
+    node->len = new_len;
+}
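ast_node_prune above removes matching children with the usual in-place compaction idiom: a write index trails the read index, survivors are shifted down, and len shrinks at the end. A standalone sketch of the same pattern on a plain int array (illustration only, not project code; the real function additionally recurses into survivors and frees the pruned subtrees):

#include <stddef.h>

/* Remove every element equal to `drop` from arr[0..*len), compacting the
 * survivors to the front and shrinking *len, the same way ast_node_prune
 * compacts node->children (minus the recursion and the ast_node_free). */
static void compact_remove(int *arr, size_t *len, int drop) {
    size_t new_len = 0;
    for (size_t i = 0; i < *len; i++) {
        if (arr[i] == drop)
            continue;            /* pruned: skip instead of copying */
        arr[new_len++] = arr[i]; /* survivor: move down to the next free slot */
    }
    *len = new_len;
}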
src/ast.h (14 changed lines)
@@ -50,6 +50,7 @@ typedef enum node_id {
     NODE_MINUS,
     NODE_ASTERISK,
     NODE_DOT,
+    NODE_NEWLINE,
 } node_id_t;
 
 typedef struct ast_node ast_node_t;
@@ -119,4 +120,17 @@ error_t *ast_node_add_child(ast_node_t *node, ast_node_t *child);
  */
 void ast_node_print(ast_node_t *node);
 
+/**
+ * Prune the children with a given id
+ *
+ * The tree is recursively visited and all child nodes of a given ID are pruned
+ * completely. If a node has the given id, it will get removed along with all its
+ * children, even if some of those children have different ids. The root node id
+ * is never checked so the tree is guaranteed to remain allocated and valid.
+ *
+ * @param node The root of the tree you want to prune
+ * @param id The id of the nodes you want to prune
+ */
+void ast_node_prune(ast_node_t *node, node_id_t id);
+
 #endif // INCLUDE_SRC_AST_H_
@@ -1,4 +1,5 @@
 #include "combinators.h"
+#include "util.h"
 
 // Parse a list of the given parser delimited by the given token id. Does not
 // store the delimiters in the parent node
@@ -122,5 +123,12 @@ parse_result_t parse_consecutive(tokenlist_entry_t *current, node_id_t id,
         }
         current = result.next;
     }
+
+    // token stream ended before we matched all parsers
+    if (parser != nullptr) {
+        ast_node_free(all);
+        return parse_no_match();
+    }
+
     return parse_success(all, current);
 }
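The new post-loop check in parse_consecutive is the heart of the zero-operand fix: if the token stream runs dry while parsers are still pending, the partially built node is freed and the whole sequence reports no match instead of a truncated success. A minimal standalone sketch of that shape (hypothetical step_fn/run_sequence names, assuming C23 just as the project's use of nullptr does):

#include <stdbool.h>

typedef bool (*step_fn)(const char **cursor); /* hypothetical step: consume or fail */

/* Walk a nullptr-terminated list of steps over a cursor. Exiting the loop
 * while a step is still pending means the input ended mid-sequence, which
 * must count as failure rather than "success so far". */
static bool run_sequence(const char **cursor, step_fn *steps) {
    step_fn *step = steps;
    for (; *step != nullptr; step++) {
        if (**cursor == '\0')
            break;           /* input exhausted before the sequence finished */
        if (!(*step)(cursor))
            return false;    /* a step rejected the input outright */
    }
    return *step == nullptr; /* success only if every step got to run */
}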
@@ -120,22 +120,28 @@ parse_result_t parse_section_directive(tokenlist_entry_t *current) {
 }
 
 parse_result_t parse_directive(tokenlist_entry_t *current) {
-    parser_t parsers[] = {parse_dot, parse_section_directive, nullptr};
+    parser_t parsers[] = {parse_dot, parse_section_directive, parse_newline,
+                          nullptr};
     return parse_consecutive(current, NODE_DIRECTIVE, parsers);
 }
 
 parse_result_t parse_instruction(tokenlist_entry_t *current) {
-    parser_t parsers[] = {parse_identifier, parse_operands, nullptr};
+    parser_t parsers[] = {parse_identifier, parse_operands, parse_newline,
+                          nullptr};
     return parse_consecutive(current, NODE_INSTRUCTION, parsers);
 }
 
 parse_result_t parse_statement(tokenlist_entry_t *current) {
     parser_t parsers[] = {parse_label, parse_directive, parse_instruction,
-                          nullptr};
+                          parse_newline, nullptr};
     return parse_any(current, parsers);
 }
 
 parse_result_t parse(tokenlist_entry_t *current) {
     current = tokenlist_skip_trivia(current);
-    return parse_many(current, NODE_PROGRAM, true, parse_statement);
+    parse_result_t result =
+        parse_many(current, NODE_PROGRAM, true, parse_statement);
+    if (result.node != nullptr)
+        ast_node_prune(result.node, NODE_NEWLINE);
+    return result;
 }
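With parse() pruning NODE_NEWLINE before returning, the newline markers are purely internal: callers should never see them in the finished tree. A small check a test could perform (hypothetical helper, not part of this PR; it only relies on the ast_node_t fields visible in this diff, and the ast.h include path is assumed):

#include <stdbool.h>
#include <stddef.h>
#include "ast.h" /* assumed include path for ast_node_t / node_id_t */

/* Recursively search the tree for any surviving node with the given id;
 * after parse() this should return false for NODE_NEWLINE. */
static bool tree_contains(const ast_node_t *node, node_id_t id) {
    if (node->id == id)
        return true;
    for (size_t i = 0; i < node->len; i++)
        if (tree_contains(node->children[i], id))
            return true;
    return false;
}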
@@ -62,6 +62,10 @@ parse_result_t parse_dot(tokenlist_entry_t *current) {
     return parse_token(current, TOKEN_DOT, NODE_DOT, nullptr);
 }
 
+parse_result_t parse_newline(tokenlist_entry_t *current) {
+    return parse_token(current, TOKEN_NEWLINE, NODE_NEWLINE, nullptr);
+}
+
 parse_result_t parse_label_reference(tokenlist_entry_t *current) {
     return parse_token(current, TOKEN_IDENTIFIER, NODE_LABEL_REFERENCE,
                        nullptr);
@@ -18,6 +18,7 @@ parse_result_t parse_plus(tokenlist_entry_t *current);
 parse_result_t parse_minus(tokenlist_entry_t *current);
 parse_result_t parse_asterisk(tokenlist_entry_t *current);
 parse_result_t parse_dot(tokenlist_entry_t *current);
+parse_result_t parse_newline(tokenlist_entry_t *current);
 parse_result_t parse_label_reference(tokenlist_entry_t *current);
 
 /* These are "primitives" with a different name and some extra validation on top
@@ -86,7 +86,6 @@ bool is_trivia(tokenlist_entry_t *trivia) {
     switch (trivia->token.id) {
     case TOKEN_WHITESPACE:
     case TOKEN_COMMENT:
-    case TOKEN_NEWLINE:
         return true;
     default:
         return false;
tests/input/regression/test_no_operands_eof.asm (new file, 5 lines)
@@ -0,0 +1,5 @@
+; regression test for two issues:
+; - parsing two zero operand instructions in a row
+; - a zero operand instruction just before eof
+syscall
+ret
@@ -23,9 +23,46 @@ MunitResult test_regression_trivia_head(const MunitParameter params[], void *data) {
 
     ast_node_free(result.node);
     tokenlist_free(list);
+    return MUNIT_OK;
+}
+
+MunitResult test_no_operands_eof(const MunitParameter params[], void *data) {
+    (void)params;
+    (void)data;
+
+    lexer_t *lex = &(lexer_t){};
+    error_t *err = lexer_open(lex, "tests/input/regression/test_no_operands_eof.asm");
+    munit_assert_null(err);
+
+    tokenlist_t *list;
+    err = tokenlist_alloc(&list);
+    munit_assert_null(err);
+
+    err = tokenlist_fill(list, lex);
+    munit_assert_null(err);
+
+    parse_result_t result = parse(list->head);
+    munit_assert_null(result.err);
+    munit_assert_null(result.next);
+
+    // Both children should be instructions
+    munit_assert_size(result.node->len, ==, 2);
+    munit_assert_int(result.node->children[0]->id, ==, NODE_INSTRUCTION);
+    munit_assert_int(result.node->children[1]->id, ==, NODE_INSTRUCTION);
+
+    // And they should have empty operands
+    munit_assert_size(result.node->children[0]->len, ==, 2);
+    munit_assert_size(result.node->children[1]->len, ==, 2);
+    munit_assert_size(result.node->children[0]->children[1]->len, ==, 0);
+    munit_assert_size(result.node->children[1]->children[1]->len, ==, 0);
+
+    ast_node_free(result.node);
+    tokenlist_free(list);
+    return MUNIT_OK;
 }
 
 MunitTest regression_tests[] = {
     {"/trivia_head", test_regression_trivia_head, nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr},
+    {"/no_operands_eof", test_no_operands_eof, nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr},
     {nullptr, nullptr, nullptr, nullptr, MUNIT_TEST_OPTION_NONE, nullptr}
 };