Fix grammar not being able to disambiguate some instructions

When two identifiers follow each other, they could be either two instruction mnemonics or one instruction mnemonic followed by one operand. To fix this, TOKEN_NEWLINE has been reintroduced as a semantic token. The grammar has been changed to allow empty statements, and every instruction and directive now has to end in a newline. Labels do not have to end in a newline. In addition to updating the grammar, the implementations of tokenlist, ast and parser have been updated to reflect these changes.
2025-04-16 12:34:44 +02:00 · 2025-04-16 12:34:44 +02:00 · 242fd9baa5
commit 242fd9baa5
parent 1574ec6249
7 changed files with 18 additions and 8 deletions
--- a/doc/parser_grammar.txt
+++ b/doc/parser_grammar.txt
@ -1,13 +1,13 @@
 <program>   ::= <statement>*
-<statement> ::= <label> | <directive> | <instruction>
+<statement> ::= <label> | <directive> | <instruction> | <newline>

 <label> ::= <identifier> <colon>

-<directive> ::= <dot> <section_directive>
+<directive> ::= <dot> <section_directive> <newline>

 <section_directive> ::= "section" <identifier>

-<instruction> ::= <identifier> <operands>
+<instruction> ::= <identifier> <operands> <newline>

 <operands> ::= <operand> ( <comma> <operand> )*

--- a/src/ast.c
+++ b/src/ast.c
@ -157,6 +157,8 @@ const char *ast_node_id_to_cstr(node_id_t id) {
        return "NODE_ASTERISK";
    case NODE_DOT:
        return "NODE_DOT";
+    case NODE_NEWLINE:
+        return "NODE_NEWLINE";
    }
    assert(!"Unreachable, weird node id" && id);
    __builtin_unreachable();
@ -172,7 +174,8 @@ static void ast_node_print_internal(ast_node_t *node, int indent) {
    }
    printf("%s", ast_node_id_to_cstr(node->id));

-    if (node->token_entry && node->token_entry->token.value) {
+    if (node->token_entry && node->token_entry->token.value &&
+        node->id != NODE_NEWLINE) {
        printf(" \"%s\"", node->token_entry->token.value);
    }
    printf("\n");
--- a/src/ast.h
+++ b/src/ast.h
@ -50,6 +50,7 @@ typedef enum node_id {
    NODE_MINUS,
    NODE_ASTERISK,
    NODE_DOT,
+    NODE_NEWLINE,
 } node_id_t;

 typedef struct ast_node ast_node_t;
--- a/src/parser/parser.c
+++ b/src/parser/parser.c
@ -120,18 +120,20 @@ parse_result_t parse_section_directive(tokenlist_entry_t *current) {
 }

 parse_result_t parse_directive(tokenlist_entry_t *current) {
-    parser_t parsers[] = {parse_dot, parse_section_directive, nullptr};
+    parser_t parsers[] = {parse_dot, parse_section_directive, parse_newline,
+                          nullptr};
    return parse_consecutive(current, NODE_DIRECTIVE, parsers);
 }

 parse_result_t parse_instruction(tokenlist_entry_t *current) {
-    parser_t parsers[] = {parse_identifier, parse_operands, nullptr};
+    parser_t parsers[] = {parse_identifier, parse_operands, parse_newline,
+                          nullptr};
    return parse_consecutive(current, NODE_INSTRUCTION, parsers);
 }

 parse_result_t parse_statement(tokenlist_entry_t *current) {
    parser_t parsers[] = {parse_label, parse_directive, parse_instruction,
-                          nullptr};
+                          parse_newline, nullptr};
    return parse_any(current, parsers);
 }

--- a/src/parser/primitives.c
+++ b/src/parser/primitives.c
@ -62,6 +62,10 @@ parse_result_t parse_dot(tokenlist_entry_t *current) {
    return parse_token(current, TOKEN_DOT, NODE_DOT, nullptr);
 }

+parse_result_t parse_newline(tokenlist_entry_t *current) {
+    return parse_token(current, TOKEN_NEWLINE, NODE_NEWLINE, nullptr);
+}
+
 parse_result_t parse_label_reference(tokenlist_entry_t *current) {
    return parse_token(current, TOKEN_IDENTIFIER, NODE_LABEL_REFERENCE,
                       nullptr);
--- a/src/parser/primitives.h
+++ b/src/parser/primitives.h
@ -18,6 +18,7 @@ parse_result_t parse_plus(tokenlist_entry_t *current);
 parse_result_t parse_minus(tokenlist_entry_t *current);
 parse_result_t parse_asterisk(tokenlist_entry_t *current);
 parse_result_t parse_dot(tokenlist_entry_t *current);
+parse_result_t parse_newline(tokenlist_entry_t *current);
 parse_result_t parse_label_reference(tokenlist_entry_t *current);

 /* These are "primitives" with a different name and some extra validation on top
--- a/src/tokenlist.c
+++ b/src/tokenlist.c
@ -86,7 +86,6 @@ bool is_trivia(tokenlist_entry_t *trivia) {
    switch (trivia->token.id) {
    case TOKEN_WHITESPACE:
    case TOKEN_COMMENT:
-    case TOKEN_NEWLINE:
        return true;
    default:
        return false;