Fix parse_operand

More grammar
Slightly change the valid test input file
2025-04-02 15:49:49 +02:00 · 2025-04-02 15:39:41 +02:00 · 2025-04-02 13:00:06 +02:00 · 2025-04-02 13:00:01 +02:00 · 2025-04-02 12:59:55 +02:00 · 2025-04-02 12:59:55 +02:00
13 changed files with 554 additions and 30 deletions
--- a/2
+++ b/2
@@ -10,7 +10,7 @@ OBJECTS = $(SOURCES:.c=.o)
 DEPENDENCIES = $(SOURCES:.c=.d)
 TARGET?=oas
 OUTPUTS=oas oas-asan oas-msan oas-afl
-RUNARGUMENTS?=-tokens tests/input/valid.asm
+RUNARGUMENTS?=ast tests/input/valid.asm
 all: $(TARGET)
--- a/src/lexer.c
+++ b/src/lexer.c
@@ -183,7 +183,7 @@ error_t *lexer_consume_n(lexer_t *lex, const size_t len,
                         char buffer[static len], const size_t n) {
    if (lex->buffer_count < n)
        return err_buffer_underrun;
-    if (len > n)
+    if (n > len)
        return err_consume_excessive_length;
    memcpy(buffer, lex->buffer, n);
--- a/src/main.c
+++ b/src/main.c
@@ -1,5 +1,6 @@
 #include "error.h"
 #include "lexer.h"
 #include "parser/parser.h"
 #include "tokenlist.h"
 #include <limits.h>
@@ -7,38 +8,64 @@
 #include <stdlib.h>
 #include <string.h>
-bool print_token(lexer_token_t *token) {
+typedef enum mode { MODE_AST, MODE_TEXT, MODE_TOKENS } mode_t;
-    lexer_token_print(token);
+
-    return true;
+void print_tokens(tokenlist_t *list) {
    for (auto entry = list->head; entry; entry = entry->next) {
        auto token = &entry->token;
        lexer_token_print(token);
    }
 }
-bool print_value(lexer_token_t *token) {
+void print_text(tokenlist_t *list) {
-    if (token->id == TOKEN_ERROR) {
+    for (auto entry = list->head; entry; entry = entry->next) {
-        printf("%s\n", token->value);
+        auto token = &entry->token;
-        for (size_t i = 0; i < token->character_number; ++i)
+        if (token->id == TOKEN_ERROR) {
-            printf(" ");
+            printf("%s\n", token->value);
-        printf("^-- %s\n", token->explanation);
+            for (size_t i = 0; i < token->character_number; ++i)
-    } else {
+                printf(" ");
-        printf("%s", token->value);
+            printf("^-- %s\n", token->explanation);
            return;
        } else {
            printf("%s", token->value);
        }
    }
-    return token->id != TOKEN_ERROR;
+}
 // Parse the token list starting at list->head and print the outcome to
 // stdout: the error message when parsing fails, otherwise the AST followed,
 // if parsing stopped early, by the first token that was not consumed.
 void print_ast(tokenlist_t *list) {
    parse_result_t outcome = parse(list->head);
    if (outcome.err == nullptr) {
        ast_node_print(outcome.node);
        // A non-null `next` means the parser stopped before the end of input
        if (outcome.next) {
            puts("First unparsed token:");
            lexer_token_print(&outcome.next->token);
        }
        ast_node_free(outcome.node);
    } else {
        puts(outcome.err->message);
        error_free(outcome.err);
    }
 }
 // Translate the command line into one of the MODE_* constants.
 // Expects exactly `oas <mode> <filename>` where <mode> is "tokens", "text"
 // or "ast"; anything else prints the usage line and exits with status 1.
 int get_execution_mode(int argc, char *argv[]) {
    if (argc == 3) {
        const char *selector = argv[1];
        if (strcmp(selector, "tokens") == 0)
            return MODE_TOKENS;
        if (strcmp(selector, "text") == 0)
            return MODE_TEXT;
        if (strcmp(selector, "ast") == 0)
            return MODE_AST;
    }
    // Wrong argument count or unknown mode name
    puts("Usage: oas [tokens|text|ast] <filename>");
    exit(1);
 }
 int main(int argc, char *argv[]) {
-    if (argc != 3 ||
+    mode_t mode = get_execution_mode(argc, argv);
        (strcmp(argv[1], "-tokens") != 0 && strcmp(argv[1], "-text") != 0)) {
        puts("Usage: oas -tokens <filename>");
        puts("Usage: oas -text <filename>");
        return 1;
    }
    bool (*print_fn)(lexer_token_t *);
    char *filename = argv[2];
    if (strcmp(argv[1], "-tokens") == 0) {
        print_fn = print_token;
    } else {
        print_fn = print_value;
    }
    lexer_t *lex = &(lexer_t){};
    error_t *err = lexer_open(lex, filename);
@@ -54,9 +81,18 @@ int main(int argc, char *argv[]) {
    if (err)
        goto cleanup_tokens;
-    for (auto entry = list->head; entry; entry = entry->next) {
+    switch (mode) {
-        print_fn(&entry->token);
+    case MODE_TOKENS:
        print_tokens(list);
        break;
    case MODE_TEXT:
        print_text(list);
        break;
    case MODE_AST:
        print_ast(list);
        break;
    }
    tokenlist_free(list);
    error_free(err);
    return 0;
--- a/src/parser/combinators.c
+++ b/src/parser/combinators.c
@@ -0,0 +1,126 @@
 #include "combinators.h"
 // Parse a list of the given parser delimited by the given token id. Does not
 // store the delimiters in the parent node.
 //
 // current:      first token to consider; nullptr is treated as empty input
 // id:           node id assigned to the returned parent node
 // allow_none:   when false, an empty list is reported as a no-match
 // delimiter_id: id of the token that must separate consecutive elements
 // parser:       parser invoked for every element
 //
 // On success the parent node holds one child per parsed element. A delimiter
 // is only consumed when an element follows it: if the element after a
 // delimiter does not match, or the input ends right after the delimiter, the
 // delimiter is left unconsumed so the caller sees it as unparsed input. Any
 // error other than err_parse_no_match coming from `parser` is returned as-is.
 parse_result_t parse_list(tokenlist_entry_t *current, node_id_t id,
                           bool allow_none, lexer_token_id_t delimiter_id,
                           parser_t parser) {
    ast_node_t *many;
    error_t *err = ast_node_alloc(&many);
    if (err)
        return parse_error(err);
    many->id = id;
    // `accepted` trails `current`: it only moves forward once an element has
    // been parsed and stored, so a delimiter we looked past without finding
    // another element is never reported as consumed.
    tokenlist_entry_t *accepted = current;
    while (current) {
        // Skip beyond the delimiter on all but the first iteration
        if (many->len > 0) {
            if (current->token.id != delimiter_id)
                break;
            current = tokenlist_next(current);
            if (current == nullptr)
                break;
        }
        parse_result_t result = parser(current);
        if (result.err == err_parse_no_match)
            break;
        if (result.err) {
            ast_node_free(many);
            return result;
        }
        err = ast_node_add_child(many, result.node);
        if (err) {
            ast_node_free(many);
            ast_node_free(result.node);
            return parse_error(err);
        }
        current = result.next;
        accepted = current;
    }
    if (!allow_none && many->len == 0) {
        ast_node_free(many);
        return parse_no_match();
    }
    return parse_success(many, accepted);
 }
 // Try each parser of the nullptr-terminated `parsers` array in order, all
 // starting at `current`, and return the result of the first one that
 // succeeds. An alternative that reports err_parse_no_match is skipped; any
 // other error (for example a failed allocation) is returned to the caller
 // immediately instead of being silently turned into a no-match, matching how
 // parse_many and parse_list treat errors. Returns a no-match when every
 // alternative reports a no-match.
 parse_result_t parse_any(tokenlist_entry_t *current, parser_t parsers[]) {
    for (parser_t *alternative = parsers; *alternative != nullptr;
         ++alternative) {
        parse_result_t result = (*alternative)(current);
        // Either a success or a real error: both end the search
        if (result.err != err_parse_no_match)
            return result;
    }
    return parse_no_match();
 }
 // Parse as many objects matched by the given parser in a row as possible,
 // potentially allowing none. Wraps the found objects in a new ast node with
 // the given node id.
 //
 // current:    first token to consider; nullptr is treated as empty input
 // id:         node id assigned to the returned parent node
 // allow_none: when false, zero matches are reported as a no-match
 // parser:     parser invoked repeatedly
 //
 // Repetition stops at the first no-match, at the end of input, or when
 // `parser` succeeds without advancing. Any error other than
 // err_parse_no_match coming from `parser` is returned as-is.
 parse_result_t parse_many(tokenlist_entry_t *current, node_id_t id,
                           bool allow_none, parser_t parser) {
    ast_node_t *many;
    error_t *err = ast_node_alloc(&many);
    if (err)
        return parse_error(err);
    many->id = id;
    while (current) {
        parse_result_t result = parser(current);
        if (result.err == err_parse_no_match)
            break;
        if (result.err) {
            ast_node_free(many);
            return result;
        }
        // A success that did not move forward would repeat forever and grow
        // the node without bound, so treat it as the end of the repetition.
        if (result.next == current) {
            ast_node_free(result.node);
            break;
        }
        err = ast_node_add_child(many, result.node);
        if (err) {
            ast_node_free(many);
            ast_node_free(result.node);
            return parse_error(err);
        }
        current = result.next;
    }
    if (!allow_none && many->len == 0) {
        ast_node_free(many);
        return parse_no_match();
    }
    return parse_success(many, current);
 }
 // Run the parsers of the nullptr-terminated `parsers` array one after the
 // other, each starting where the previous one stopped, and collect the
 // resulting nodes as children of a new parent node with the given id.
 // The first parser that fails aborts the sequence and its result is returned.
 // NOTE(review): when the token list runs out before every parser has run,
 // the loop simply stops and the partially filled node is returned as a
 // success -- confirm whether a truncated sequence should match.
 parse_result_t parse_consecutive(tokenlist_entry_t *current, node_id_t id,
                                  parser_t parsers[]) {
    ast_node_t *all;
    error_t *err = ast_node_alloc(&all);
    if (err)
        return parse_error(err);
    all->id = id;
    for (parser_t *step = parsers; *step != nullptr && current != nullptr;
         ++step) {
        parse_result_t result = (*step)(current);
        if (result.err) {
            ast_node_free(all);
            return result;
        }
        err = ast_node_add_child(all, result.node);
        if (err) {
            ast_node_free(result.node);
            ast_node_free(all);
            return parse_error(err);
        }
        current = result.next;
    }
    return parse_success(all, current);
 }
--- a/src/parser/combinators.h
+++ b/src/parser/combinators.h
@@ -0,0 +1,25 @@
 #ifndef INCLUDE_PARSER_COMBINATORS_H_
 #define INCLUDE_PARSER_COMBINATORS_H_
 #include "util.h"
 // A parser takes the token to start at and reports what it matched.
 typedef parse_result_t (*parser_t)(tokenlist_entry_t *);
 // Try the parsers of a nullptr-terminated array in order, stopping at the
 // first one that succeeds.
 parse_result_t parse_any(tokenlist_entry_t *current, parser_t parsers[]);
 // Parse as many objects matched by the given parser in a row as possible,
 // potentially allowing none. Wraps the found objects in a new ast node with
 // the given node id.
 parse_result_t parse_many(tokenlist_entry_t *current, node_id_t id,
                           bool allow_none, parser_t parser);
 // Parse a list of the given parser delimited by the given token id. The
 // delimiters are not stored in the parent node.
 parse_result_t parse_list(tokenlist_entry_t *current, node_id_t id,
                           bool allow_none, lexer_token_id_t delimiter_id,
                           parser_t parser);
 // Tries to run all parsers of a nullptr-terminated array consecutively and,
 // if that succeeds, wraps the parsed nodes in a new parent node.
 parse_result_t parse_consecutive(tokenlist_entry_t *current, node_id_t id,
                                  parser_t parsers[]);
 #endif // INCLUDE_PARSER_COMBINATORS_H_
--- a/src/parser/parser.c
+++ b/src/parser/parser.c
@@ -0,0 +1,137 @@
 #include "parser.h"
 #include "../ast.h"
 #include "../lexer.h"
 #include "../tokenlist.h"
 #include "combinators.h"
 #include "primitives.h"
 #include "util.h"
 // Match one numeric literal. The alternatives are tried in this order:
 // octal, decimal, hexadecimal, binary.
 parse_result_t parse_number(tokenlist_entry_t *current) {
    parser_t parsers[] = {parse_octal, parse_decimal, parse_hexadecimal,
                          parse_binary, nullptr};
    return parse_any(current, parsers);
 }
 // Match a single plus token or, failing that, a single minus token.
 parse_result_t parse_plus_or_minus(tokenlist_entry_t *current) {
    parser_t parsers[] = {parse_plus, parse_minus, nullptr};
    return parse_any(current, parsers);
 }
 // Match the sequence: plus, register, asterisk, number.
 // NOTE(review): the resulting node is tagged NODE_LABEL, the same id
 // parse_label uses -- confirm whether a dedicated node id is intended.
 parse_result_t parse_register_index(tokenlist_entry_t *current) {
    parser_t parsers[] = {parse_plus, parse_register, parse_asterisk,
                          parse_number, nullptr};
    return parse_consecutive(current, NODE_LABEL, parsers);
 }
 // Match the sequence: plus or minus, number.
 // NOTE(review): the resulting node is tagged NODE_LABEL, the same id
 // parse_label uses -- confirm whether a dedicated node id is intended.
 parse_result_t parse_register_offset(tokenlist_entry_t *current) {
    parser_t parsers[] = {parse_plus_or_minus, parse_number, nullptr};
    return parse_consecutive(current, NODE_LABEL, parsers);
 }
 parse_result_t parse_register_expression(tokenlist_entry_t *current) {
    parse_result_t result;
    ast_node_t *expr;
    error_t *err = ast_node_alloc(&expr);
    if (err)
        return parse_error(err);
    // <register>
    result = parse_register(current);
    if (result.err) {
        ast_node_free(expr);
        return result;
    }
    err = ast_node_add_child(expr, result.node);
    if (err) {
        ast_node_free(result.node);
        ast_node_free(expr);
        return parse_error(err);
    }
    current = result.next;
    // <register_index>?
    result = parse_register_index(current);
    if (result.err) {
        error_free(result.err);
    } else {
        err = ast_node_add_child(expr, result.node);
        if (err) {
            ast_node_free(result.node);
            ast_node_free(expr);
            return parse_error(err);
        }
        current = result.next;
    }
    // <register_offset>?
    result = parse_register_offset(current);
    if (result.err) {
        error_free(result.err);
    } else {
        err = ast_node_add_child(expr, result.node);
        if (err) {
            ast_node_free(result.node);
            ast_node_free(expr);
            return parse_error(err);
        }
        current = result.next;
    }
    return parse_success(expr, current);
 }
 // Match an immediate operand: a number or, failing that, an identifier.
 parse_result_t parse_immediate(tokenlist_entry_t *current) {
    parser_t parsers[] = {parse_number, parse_identifier, nullptr};
    return parse_any(current, parsers);
 }
 // Match the inside of a memory operand: a register expression or, failing
 // that, a plain identifier.
 parse_result_t parse_memory_expression(tokenlist_entry_t *current) {
    parser_t parsers[] = {parse_register_expression, parse_identifier, nullptr};
    return parse_any(current, parsers);
 }
 // Match the sequence: left bracket, memory expression, right bracket. The
 // bracket tokens are kept as children of the resulting node.
 // NOTE(review): the resulting node is tagged NODE_LABEL, the same id
 // parse_label uses -- confirm whether a dedicated node id is intended.
 parse_result_t parse_memory(tokenlist_entry_t *current) {
    parser_t parsers[] = {parse_lbracket, parse_memory_expression,
                          parse_rbracket, nullptr};
    return parse_consecutive(current, NODE_LABEL, parsers);
 }
 // Match one operand: register, memory or immediate, tried in that order.
 // Because parse_register runs before parse_immediate, an identifier that
 // names a register becomes a NODE_REGISTER rather than a NODE_IDENTIFIER.
 parse_result_t parse_operand(tokenlist_entry_t *current) {
    parser_t parsers[] = {parse_register, parse_memory, parse_immediate,
                          nullptr};
    return parse_any(current, parsers);
 }
 // Match a comma-delimited list of operands wrapped in a NODE_OPERANDS node.
 // allow_none is true, so an empty operand list is accepted as well.
 parse_result_t parse_operands(tokenlist_entry_t *current) {
    return parse_list(current, NODE_OPERANDS, true, TOKEN_COMMA, parse_operand);
 }
 // Match a label: an identifier followed by a colon, wrapped in NODE_LABEL.
 parse_result_t parse_label(tokenlist_entry_t *current) {
    parser_t parsers[] = {parse_identifier, parse_colon, nullptr};
    return parse_consecutive(current, NODE_LABEL, parsers);
 }
 // Match the sequence: the identifier "section", then an identifier. The
 // result is wrapped in NODE_SECTION_DIRECTIVE.
 parse_result_t parse_section_directive(tokenlist_entry_t *current) {
    parser_t parsers[] = {parse_section, parse_identifier, nullptr};
    return parse_consecutive(current, NODE_SECTION_DIRECTIVE, parsers);
 }
 // Match a directive: a dot token followed by a section directive (the only
 // directive handled here), wrapped in NODE_DIRECTIVE.
 parse_result_t parse_directive(tokenlist_entry_t *current) {
    parser_t parsers[] = {parse_dot, parse_section_directive, nullptr};
    return parse_consecutive(current, NODE_DIRECTIVE, parsers);
 }
 // Match an instruction: an identifier followed by an operand list (which may
 // be empty), wrapped in NODE_INSTRUCTION.
 parse_result_t parse_instruction(tokenlist_entry_t *current) {
    parser_t parsers[] = {parse_identifier, parse_operands, nullptr};
    return parse_consecutive(current, NODE_INSTRUCTION, parsers);
 }
 // Match one statement: label, directive or instruction, tried in that order.
 // parse_label runs before parse_instruction; both begin with an identifier.
 parse_result_t parse_statement(tokenlist_entry_t *current) {
    parser_t parsers[] = {parse_label, parse_directive, parse_instruction,
                          nullptr};
    return parse_any(current, parsers);
 }
 // Entry point of the parser: match zero or more statements and wrap them in
 // a NODE_PROGRAM node. allow_none is true, so input without any statement
 // still yields an (empty) program node.
 parse_result_t parse(tokenlist_entry_t *current) {
    return parse_many(current, NODE_PROGRAM, true, parse_statement);
 }
--- a/src/parser/parser.h
+++ b/src/parser/parser.h
@@ -0,0 +1,9 @@
 #ifndef INCLUDE_PARSER_PARSER_H_
 #define INCLUDE_PARSER_PARSER_H_
 #include "../tokenlist.h"
 #include "util.h"
 // Parse the token list starting at `current` into an AST. The result carries
 // either an error or the root node plus the first token left unparsed.
 parse_result_t parse(tokenlist_entry_t *current);
 #endif // INCLUDE_PARSER_PARSER_H_
--- a/src/parser/primitives.c
+++ b/src/parser/primitives.c
@@ -0,0 +1,97 @@
 #include "primitives.h"
 #include "../ast.h"
 #include <string.h>
 // Primitive parsers: each one matches exactly one token of the named kind
 // and, through parse_token, turns it into a leaf node with the corresponding
 // node id. No extra validation is applied (the validator is nullptr).
 parse_result_t parse_identifier(tokenlist_entry_t *current) {
    return parse_token(current, TOKEN_IDENTIFIER, NODE_IDENTIFIER, nullptr);
 }
 parse_result_t parse_decimal(tokenlist_entry_t *current) {
    return parse_token(current, TOKEN_DECIMAL, NODE_DECIMAL, nullptr);
 }
 parse_result_t parse_hexadecimal(tokenlist_entry_t *current) {
    return parse_token(current, TOKEN_HEXADECIMAL, NODE_HEXADECIMAL, nullptr);
 }
 parse_result_t parse_binary(tokenlist_entry_t *current) {
    return parse_token(current, TOKEN_BINARY, NODE_BINARY, nullptr);
 }
 parse_result_t parse_octal(tokenlist_entry_t *current) {
    return parse_token(current, TOKEN_OCTAL, NODE_OCTAL, nullptr);
 }
 parse_result_t parse_string(tokenlist_entry_t *current) {
    return parse_token(current, TOKEN_STRING, NODE_STRING, nullptr);
 }
 parse_result_t parse_char(tokenlist_entry_t *current) {
    return parse_token(current, TOKEN_CHAR, NODE_CHAR, nullptr);
 }
 parse_result_t parse_colon(tokenlist_entry_t *current) {
    return parse_token(current, TOKEN_COLON, NODE_COLON, nullptr);
 }
 parse_result_t parse_comma(tokenlist_entry_t *current) {
    return parse_token(current, TOKEN_COMMA, NODE_COMMA, nullptr);
 }
 parse_result_t parse_lbracket(tokenlist_entry_t *current) {
    return parse_token(current, TOKEN_LBRACKET, NODE_LBRACKET, nullptr);
 }
 parse_result_t parse_rbracket(tokenlist_entry_t *current) {
    return parse_token(current, TOKEN_RBRACKET, NODE_RBRACKET, nullptr);
 }
 parse_result_t parse_plus(tokenlist_entry_t *current) {
    return parse_token(current, TOKEN_PLUS, NODE_PLUS, nullptr);
 }
 parse_result_t parse_minus(tokenlist_entry_t *current) {
    return parse_token(current, TOKEN_MINUS, NODE_MINUS, nullptr);
 }
 parse_result_t parse_asterisk(tokenlist_entry_t *current) {
    return parse_token(current, TOKEN_ASTERISK, NODE_ASTERISK, nullptr);
 }
 parse_result_t parse_dot(tokenlist_entry_t *current) {
    return parse_token(current, TOKEN_DOT, NODE_DOT, nullptr);
 }
 // Names accepted by is_register_token. The table is terminated by nullptr
 // so it can be walked without a separate length.
 const char *registers[] = {
    // 64-bit registers
    "rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi", "r8", "r9", "r10",
    "r11", "r12", "r13", "r14", "r15",
    // 32-bit registers
    "eax", "ecx", "edx", "ebx", "esp", "ebp", "esi", "edi", "r8d", "r9d",
    "r10d", "r11d", "r12d", "r13d", "r14d", "r15d",
    // 16-bit registers
    "ax", "cx", "dx", "bx", "sp", "bp", "si", "di", "r8w", "r9w", "r10w",
    "r11w", "r12w", "r13w", "r14w", "r15w",
    // 8-bit low registers
    "al", "cl", "dl", "bl", "spl", "bpl", "sil", "dil", "r8b", "r9b", "r10b",
    "r11b", "r12b", "r13b", "r14b", "r15b", nullptr};
 // Token validator: true when the token's value is exactly (case-sensitively)
 // one of the names in the nullptr-terminated `registers` table.
 bool is_register_token(lexer_token_t *token) {
    const char *const *name = registers;
    while (*name != nullptr) {
        if (strcmp(token->value, *name) == 0)
            return true;
        ++name;
    }
    return false;
 }
 // Match an identifier token whose value passes is_register_token and turn it
 // into a NODE_REGISTER leaf.
 parse_result_t parse_register(tokenlist_entry_t *current) {
    return parse_token(current, TOKEN_IDENTIFIER, NODE_REGISTER,
                       is_register_token);
 }
 // Token validator: true when the token's value is exactly "section".
 bool is_section_token(lexer_token_t *token) {
    return strcmp(token->value, "section") == 0;
 }
 // Match an identifier token whose value is "section" and turn it into a
 // NODE_SECTION leaf.
 parse_result_t parse_section(tokenlist_entry_t *current) {
    return parse_token(current, TOKEN_IDENTIFIER, NODE_SECTION,
                       is_section_token);
 }
--- a/src/parser/primitives.h
+++ b/src/parser/primitives.h
@@ -0,0 +1,29 @@
 #ifndef INCLUDE_PARSER_PRIMITIVES_H_
 #define INCLUDE_PARSER_PRIMITIVES_H_
 #include "util.h"
 // Primitive parsers: each matches a single token of the kind named in the
 // function and produces a leaf node for it.
 parse_result_t parse_identifier(tokenlist_entry_t *current);
 parse_result_t parse_decimal(tokenlist_entry_t *current);
 parse_result_t parse_hexadecimal(tokenlist_entry_t *current);
 parse_result_t parse_binary(tokenlist_entry_t *current);
 parse_result_t parse_octal(tokenlist_entry_t *current);
 parse_result_t parse_string(tokenlist_entry_t *current);
 parse_result_t parse_char(tokenlist_entry_t *current);
 parse_result_t parse_colon(tokenlist_entry_t *current);
 parse_result_t parse_comma(tokenlist_entry_t *current);
 parse_result_t parse_lbracket(tokenlist_entry_t *current);
 parse_result_t parse_rbracket(tokenlist_entry_t *current);
 parse_result_t parse_plus(tokenlist_entry_t *current);
 parse_result_t parse_minus(tokenlist_entry_t *current);
 parse_result_t parse_asterisk(tokenlist_entry_t *current);
 parse_result_t parse_dot(tokenlist_entry_t *current);
 /* These are "primitives" with a different name and some extra validation on
  * top. For example, a register is just an identifier, but it only matches a
  * limited set of values.
  */
 parse_result_t parse_register(tokenlist_entry_t *current);
 parse_result_t parse_section(tokenlist_entry_t *current);
 #endif // INCLUDE_PARSER_PRIMITIVES_H_
--- a/src/parser/util.c
+++ b/src/parser/util.c
@@ -0,0 +1,35 @@
 #include "util.h"
 #include "../tokenlist.h"
 // Shared sentinel meaning "this parser does not match here". The combinators
 // recognise it by comparing the pointer, not the message.
 error_t *err_parse_no_match =
    &(error_t){.message = "parsing failed to find the correct token sequence"};
 // Build a failed result carrying `err`; `node` and `next` are left zeroed.
 parse_result_t parse_error(error_t *err) {
    return (parse_result_t){.err = err};
 }
 // Build a failed result carrying the err_parse_no_match sentinel.
 parse_result_t parse_no_match() {
    return parse_error(err_parse_no_match);
 }
 // Build a successful result for `ast`. `next` is first passed through
 // tokenlist_skip_trivia, so the stored continuation point is whatever that
 // helper returns for it; `err` is left as nullptr.
 parse_result_t parse_success(ast_node_t *ast, tokenlist_entry_t *next) {
    next = tokenlist_skip_trivia(next);
    return (parse_result_t){.node = ast, .next = next};
 }
 // Match exactly one token and turn it into a leaf node.
 //
 // current:  token to inspect; nullptr (end of input) is a no-match
 // token_id: id the token must have
 // ast_id:   node id given to the created node
 // is_valid: optional extra check on the token; nullptr accepts every token
 //           with the right id
 //
 // Returns a no-match when the token is missing, has a different id or is
 // rejected by `is_valid`; the allocation error when the node cannot be
 // allocated; otherwise a node that refers to `current` through token_entry,
 // with parsing continuing after current->next.
 parse_result_t parse_token(tokenlist_entry_t *current,
                            lexer_token_id_t token_id, node_id_t ast_id,
                            token_validator_t is_valid) {
    // Callers such as parse_any forward their position unchecked, and the
    // position after the last token is nullptr: never dereference it.
    if (current == nullptr)
        return parse_no_match();
    if (current->token.id != token_id ||
        (is_valid && !is_valid(&current->token)))
        return parse_no_match();
    ast_node_t *node;
    error_t *err = ast_node_alloc(&node);
    if (err)
        return parse_error(err);
    node->id = ast_id;
    node->token_entry = current;
    return parse_success(node, current->next);
 }
--- a/src/parser/util.h
+++ b/src/parser/util.h
@@ -0,0 +1,27 @@
 #ifndef INCLUDE_PARSER_UTIL_H_
 #define INCLUDE_PARSER_UTIL_H_
 #include "../ast.h"
 #include "../error.h"
 #include "../tokenlist.h"
 // Outcome of running a parser.
 typedef struct parse_result {
    error_t *err;            // nullptr on success, otherwise the failure
    tokenlist_entry_t *next; // where parsing continues after a success
    ast_node_t *node;        // node produced by a successful parse
 } parse_result_t;
 // Extra per-token check used by parse_token on top of the token id.
 typedef bool (*token_validator_t)(lexer_token_t *);
 // Constructors for the three kinds of parse results.
 parse_result_t parse_error(error_t *err);
 parse_result_t parse_no_match();
 parse_result_t parse_success(ast_node_t *ast, tokenlist_entry_t *next);
 // Match a single token with the given id (and optional validator) and wrap
 // it in a node with the given ast id.
 parse_result_t parse_token(tokenlist_entry_t *current,
                            lexer_token_id_t token_id, node_id_t ast_id,
                            token_validator_t is_valid);
 // NOTE(review): no definition of skip_insignificant is visible in util.c,
 // which calls tokenlist_skip_trivia instead -- confirm this prototype is
 // still needed.
 tokenlist_entry_t *skip_insignificant(tokenlist_entry_t *);
 // Sentinel error returned when a parser does not match; compare by pointer.
 extern error_t *err_parse_no_match;
 #endif // INCLUDE_PARSER_UTIL_H_
--- a/tests/input/valid.asm
+++ b/tests/input/valid.asm
@@ -1,4 +1,7 @@
 .section text
 _start:
    mov eax, ebx
    mov eax, 555            ; move 555 into eax
    push 0o777
    xor eax, 0xDEADBEEF
--- a/validate.sh
+++ b/validate.sh
@@ -10,7 +10,7 @@ scan-build -o reports/static-analysis/ -plist-html --status-bugs make all
 # Run the sanitizer builds and valgrind
 make clean sanitize all
-ARGUMENTS=("-tokens" "-text")
+ARGUMENTS=("tokens" "text" "ast")
 while IFS= read -r INPUT_FILE; do
    for ARGS in ${ARGUMENTS[@]}; do
        ./oas-asan $ARGS $INPUT_FILE > /dev/null
Author	SHA1	Message	Date
omicron	d40273b329	Fix parse_operand All checks were successful Validate the build / validate-build (push) Successful in 26s Details	2025-04-02 15:49:49 +02:00
omicron	000756fca9	More grammar All checks were successful Validate the build / validate-build (push) Successful in 27s Details	2025-04-02 15:39:41 +02:00
omicron	dabd3fd86f	Slightly change the valid test input file All checks were successful Validate the build / validate-build (push) Successful in 27s Details	2025-04-02 13:00:06 +02:00
omicron	ccf8f52b6e	add functionality to main to parse and print the ast	2025-04-02 13:00:01 +02:00
omicron	35c471f8d4	Partial parser implementation	2025-04-02 12:59:55 +02:00
omicron	44fab4c678	Fix incorrect size comparison in lexer_consume_n The buffer length len and the requested number of tokens n are mixed up in an invalid comparison. This causes all valid requests for n < len tokens to be denied and all invalid requests for n > len tokens to be accepted. This may cause a buffer overflow if the caller requests more characters than they provide space for.	2025-04-02 12:59:55 +02:00
omicron	bcc1569b39	Add a parser combinator to parse a delimited list	2025-04-02 12:59:55 +02:00
omicron	5746ef1c5a	Add basic parser combinators	2025-04-02 12:59:51 +02:00
omicron	2cab530eed	Add "primitive" parsers for all the non-trivia tokens in the lexer grammar	2025-04-02 12:59:47 +02:00
omicron	7ac4eac37f	Add basic parser utilities	2025-04-02 12:59:41 +02:00