Compare commits

...

22 Commits

Author SHA1 Message Date
3a737c05d5 Slightly change the valid test input file
All checks were successful
Validate the build / validate-build (push) Successful in 26s
2025-04-02 11:36:33 +02:00
44254614c1 Fix parse_directive grammar rule 2025-04-02 11:36:33 +02:00
6230ade289 Add registers and fix section primitive parser 2025-04-02 11:36:33 +02:00
a436f23601 Fix parser loops in parse_any and parse_consecutive 2025-04-02 11:36:33 +02:00
3e325e4abd Modify main to use the new print ast functionality 2025-04-02 11:36:33 +02:00
c427adbd22 Add more grammar rules to the parser 2025-04-02 11:36:33 +02:00
1bb9425546 Add a parser combinator to parse a delimited list 2025-04-02 11:36:33 +02:00
5c620870c1 make parse_success always skip past trivia in the tokenlist 2025-04-02 11:36:33 +02:00
110a9bc31e TODO: REVIEW ME AND WRITE PROPER MESSAGE
Fix lexer issue where consuming n tokens always fails if there are n
tokens and always succeeds if they aren't n tokens
2025-04-02 11:36:33 +02:00
3af255baeb Use new validator function for parse_token calls on all primitives
Also adds new validated primitives for NODE_SECTION and NODE_REGISTER
2025-04-02 11:36:33 +02:00
d13b6102c1 Fix incorrect error returned in parse_consecutive 2025-04-02 11:36:33 +02:00
4a4523a1f0 Extend parse_token to accept an optional validator function 2025-04-02 11:36:33 +02:00
2733d4fd7e Expose err_parse_no_match in parser_util.h 2025-04-02 11:36:33 +02:00
cbe49b2db5 Add basic parser combinators 2025-04-02 11:36:33 +02:00
b92248ec47 Add "primitive" parsers for all the semantic tokens in the lexer grammar 2025-04-02 11:36:33 +02:00
018bb6ef9a Add basic parser utilities 2025-04-02 11:36:33 +02:00
85fd507004 Add functions to skip over trivia in a tokenlist 2025-04-02 11:36:33 +02:00
0f9e1886cb Fix parse_token to add the correct information to a parse node 2025-04-02 11:36:33 +02:00
d8f3838c50 Partial parser implementation 2025-04-02 11:36:33 +02:00
d3881ac19d FIXME REORDER COMMIT -- Change main so it can parse the ast
FIXME THIS COMMIT NEEDS TO BE REORDERED
FIXME THIS COMMIT NEEDS TO BE REORDERED
FIXME THIS COMMIT NEEDS TO BE REORDERED
FIXME THIS COMMIT NEEDS TO BE REORDERED
2025-04-02 11:36:33 +02:00
e5be1a527e TODO: REVIEW THIS FUNCTION Add function to print AST 2025-04-02 11:36:33 +02:00
935da30257 Add basic AST functionality 2025-04-02 11:35:53 +02:00
17 changed files with 819 additions and 30 deletions

View File

@ -10,7 +10,7 @@ OBJECTS = $(SOURCES:.c=.o)
DEPENDENCIES = $(SOURCES:.c=.d) DEPENDENCIES = $(SOURCES:.c=.d)
TARGET?=oas TARGET?=oas
OUTPUTS=oas oas-asan oas-msan oas-afl OUTPUTS=oas oas-asan oas-msan oas-afl
RUNARGUMENTS?=-tokens tests/input/valid.asm RUNARGUMENTS?=ast tests/input/valid.asm
all: $(TARGET) all: $(TARGET)

207
src/ast.c Normal file
View File

@ -0,0 +1,207 @@
#include "ast.h"
#include "error.h"
#include <assert.h>
#include <string.h>
error_t *err_node_children_cap = &(error_t){
.message = "Failed to increase ast node children, max capacity reached"};
error_t *ast_node_alloc(ast_node_t **output) {
*output = nullptr;
ast_node_t *node = calloc(1, sizeof(ast_node_t));
if (node == nullptr)
return err_allocation_failed;
*output = node;
return nullptr;
}
void ast_node_free_value(ast_node_t *node) {
// TODO: decide how value ownership will work and clean it up here
}
void ast_node_free(ast_node_t *node) {
if (node == nullptr)
return;
if (node->children) {
for (size_t i = 0; i < node->len; ++i)
ast_node_free(node->children[i]);
free(node->children);
}
ast_node_free_value(node);
memset(node, 0, sizeof(ast_node_t));
free(node);
}
/**
* @pre node->children must be nullptr
*/
error_t *ast_node_alloc_children(ast_node_t *node) {
node->children = calloc(node_default_children_cap, sizeof(ast_node_t *));
if (node->children == nullptr)
return err_allocation_failed;
node->cap = node_default_children_cap;
return nullptr;
}
error_t *ast_node_grow_cap(ast_node_t *node) {
if (node->cap >= node_max_children_cap) {
return err_node_children_cap;
}
size_t new_cap = node->cap * 2;
if (new_cap > node_max_children_cap) {
new_cap = node_max_children_cap;
}
ast_node_t **new_children =
realloc(node->children, new_cap * sizeof(ast_node_t *));
if (new_children == nullptr) {
return err_allocation_failed;
}
node->children = new_children;
node->cap = new_cap;
return nullptr;
}
error_t *ast_node_add_child(ast_node_t *node, ast_node_t *child) {
error_t *err = nullptr;
if (node->children == nullptr)
err = ast_node_alloc_children(node);
else if (node->len >= node->cap)
err = ast_node_grow_cap(node);
if (err)
return err;
node->children[node->len] = child;
node->len += 1;
return nullptr;
}
const char *ast_node_id_to_cstr(node_id_t id) {
switch (id) {
case NODE_INVALID:
return "NODE_INVALID";
case NODE_PROGRAM:
return "NODE_PROGRAM";
case NODE_STATEMENT:
return "NODE_STATEMENT";
case NODE_LABEL:
return "NODE_LABEL";
case NODE_DIRECTIVE:
return "NODE_DIRECTIVE";
case NODE_INSTRUCTION:
return "NODE_INSTRUCTION";
case NODE_OPERANDS:
return "NODE_OPERANDS";
case NODE_OPERAND:
return "NODE_OPERAND";
case NODE_IMMEDIATE:
return "NODE_IMMEDIATE";
case NODE_MEMORY:
return "NODE_MEMORY";
case NODE_NUMBER:
return "NODE_NUMBER";
case NODE_LABEL_REFERENCE:
return "NODE_LABEL_REFERENCE";
case NODE_MEMORY_EXPRESSION:
return "NODE_MEMORY_EXPRESSION";
case NODE_REGISTER_EXPRESSION:
return "NODE_REGISTER_EXPRESSION";
case NODE_REGISTER_INDEX:
return "NODE_REGISTER_INDEX";
case NODE_REGISTER_OFFSET:
return "NODE_REGISTER_OFFSET";
case NODE_PLUS_OR_MINUS:
return "NODE_PLUS_OR_MINUS";
case NODE_SECTION_DIRECTIVE:
return "NODE_SECTION_DIRECTIVE";
case NODE_REGISTER:
return "NODE_REGISTER";
case NODE_SECTION:
return "NODE_SECTION";
case NODE_IDENTIFIER:
return "NODE_IDENTIFIER";
case NODE_DECIMAL:
return "NODE_DECIMAL";
case NODE_HEXADECIMAL:
return "NODE_HEXADECIMAL";
case NODE_OCTAL:
return "NODE_OCTAL";
case NODE_BINARY:
return "NODE_BINARY";
case NODE_CHAR:
return "NODE_CHAR";
case NODE_STRING:
return "NODE_STRING";
case NODE_COLON:
return "NODE_COLON";
case NODE_COMMA:
return "NODE_COMMA";
case NODE_LBRACKET:
return "NODE_LBRACKET";
case NODE_RBRACKET:
return "NODE_RBRACKET";
case NODE_PLUS:
return "NODE_PLUS";
case NODE_MINUS:
return "NODE_MINUS";
case NODE_ASTERISK:
return "NODE_ASTERISK";
case NODE_DOT:
return "NODE_DOT";
}
assert(!"Unreachable, weird node id" && id);
__builtin_unreachable();
}
/**
* @brief Helper function to print a single AST node with indentation
*
* @param node The node to print
* @param indent Current indentation level
*/
static void ast_node_print_internal(ast_node_t *node, int indent) {
if (node == NULL) {
return;
}
// Print indentation
for (int i = 0; i < indent; i++) {
printf(" ");
}
// Print node type
printf("%s", ast_node_id_to_cstr(node->id));
// Print token value if available
if (node->token_entry && node->token_entry->token.value) {
printf(" \"%s\"", node->token_entry->token.value);
}
printf("\n");
// Recursively print all children with increased indentation
for (size_t i = 0; i < node->len; i++) {
ast_node_print_internal(node->children[i], indent + 1);
}
}
/**
* @brief Prints an AST starting from the given node
*
* Prints a representation of the AST with indentation to show structure.
* Each node's type is shown, and if a node has an associated token value,
* that value is printed in quotes.
*
* @param node The root node of the AST to print
*/
void ast_node_print(ast_node_t *node) {
ast_node_print_internal(node, 0);
}

112
src/ast.h Normal file
View File

@ -0,0 +1,112 @@
#ifndef INCLUDE_SRC_AST_H_
#define INCLUDE_SRC_AST_H_
#include "error.h"
#include "lexer.h"
#include "tokenlist.h"
#include <stddef.h>
#include <stdint.h>
typedef enum node_id {
NODE_INVALID,
NODE_PROGRAM,
NODE_STATEMENT,
NODE_LABEL,
NODE_DIRECTIVE,
NODE_INSTRUCTION,
NODE_OPERANDS,
NODE_OPERAND,
NODE_IMMEDIATE,
NODE_MEMORY,
NODE_NUMBER,
NODE_LABEL_REFERENCE,
NODE_MEMORY_EXPRESSION,
NODE_REGISTER_EXPRESSION,
NODE_REGISTER_INDEX,
NODE_REGISTER_OFFSET,
NODE_PLUS_OR_MINUS,
NODE_SECTION_DIRECTIVE,
// Validated primitives
NODE_REGISTER,
NODE_SECTION,
// Primitive nodes
NODE_IDENTIFIER,
NODE_DECIMAL,
NODE_HEXADECIMAL,
NODE_OCTAL,
NODE_BINARY,
NODE_CHAR,
NODE_STRING,
NODE_COLON,
NODE_COMMA,
NODE_LBRACKET,
NODE_RBRACKET,
NODE_PLUS,
NODE_MINUS,
NODE_ASTERISK,
NODE_DOT,
} node_id_t;
typedef struct ast_node ast_node_t;
constexpr size_t node_default_children_cap = 8;
/* 65K ought to be enough for anybody */
constexpr size_t node_max_children_cap = 1 << 16;
struct ast_node {
node_id_t id;
tokenlist_entry_t *token_entry;
size_t len;
size_t cap;
ast_node_t **children;
union {
struct {
uint64_t value;
int size;
} integer;
char *name;
} value;
};
/**
* @brief Allocates a new AST node
*
* Creates and initializes a new AST node with default (zero) values.
*
* @param[out] output Pointer to store the allocated node
* @return error_t* nullptr on success, allocation error on failure
*/
error_t *ast_node_alloc(ast_node_t **node);
/**
* @brief Frees an AST node and all its children recursively
*
* Recursively frees all children of the node, then frees the node itself.
* If node is nullptr, the function returns without doing anything.
*
* @param node The node to free
*/
void ast_node_free(ast_node_t *node);
/**
* @brief Adds a child node to a parent node
*
* Adds the specified child node to the parent's children array.
* If this is the first child, the function allocates the children array.
* If the children array is full, the function increases its capacity.
*
* @param node The parent node to add the child to
* @param child The child node to add
* @return error_t* nullptr on success, allocation error on failure,
* or err_node_children_cap if maximum capacity is reached
*/
error_t *ast_node_add_child(ast_node_t *node, ast_node_t *child);
const char *ast_node_id_to_cstr(node_id_t id);
void ast_node_print(ast_node_t *node);
#endif // INCLUDE_SRC_AST_H_

View File

@ -183,7 +183,7 @@ error_t *lexer_consume_n(lexer_t *lex, const size_t len,
char buffer[static len], const size_t n) { char buffer[static len], const size_t n) {
if (lex->buffer_count < n) if (lex->buffer_count < n)
return err_buffer_underrun; return err_buffer_underrun;
if (len > n) if (n > len)
return err_consume_excessive_length; return err_consume_excessive_length;
memcpy(buffer, lex->buffer, n); memcpy(buffer, lex->buffer, n);

View File

@ -1,5 +1,6 @@
#include "error.h" #include "error.h"
#include "lexer.h" #include "lexer.h"
#include "parser.h"
#include "tokenlist.h" #include "tokenlist.h"
#include <limits.h> #include <limits.h>
@ -7,38 +8,64 @@
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
bool print_token(lexer_token_t *token) { typedef enum mode { MODE_AST, MODE_TEXT, MODE_TOKENS } mode_t;
lexer_token_print(token);
return true; void print_tokens(tokenlist_t *list) {
for (auto entry = list->head; entry; entry = entry->next) {
auto token = &entry->token;
lexer_token_print(token);
}
} }
bool print_value(lexer_token_t *token) { void print_text(tokenlist_t *list) {
if (token->id == TOKEN_ERROR) { for (auto entry = list->head; entry; entry = entry->next) {
printf("%s\n", token->value); auto token = &entry->token;
for (size_t i = 0; i < token->character_number; ++i) if (token->id == TOKEN_ERROR) {
printf(" "); printf("%s\n", token->value);
printf("^-- %s\n", token->explanation); for (size_t i = 0; i < token->character_number; ++i)
} else { printf(" ");
printf("%s", token->value); printf("^-- %s\n", token->explanation);
return;
} else {
printf("%s", token->value);
}
} }
return token->id != TOKEN_ERROR; }
void print_ast(tokenlist_t *list) {
parse_result_t result = parse(list->head);
if (result.err) {
puts(result.err->message);
error_free(result.err);
return;
}
ast_node_print(result.node);
if (result.next != nullptr) {
puts("First unparsed token:");
lexer_token_print(&result.next->token);
}
ast_node_free(result.node);
}
int get_execution_mode(int argc, char *argv[]) {
if (argc != 3 || (strcmp(argv[1], "tokens") != 0 &&
strcmp(argv[1], "text") != 0 && strcmp(argv[1], "ast"))) {
puts("Usage: oas [tokens|text|ast] <filename>");
exit(1);
}
if (strcmp(argv[1], "tokens") == 0)
return MODE_TOKENS;
if (strcmp(argv[1], "text") == 0)
return MODE_TEXT;
return MODE_AST;
} }
int main(int argc, char *argv[]) { int main(int argc, char *argv[]) {
if (argc != 3 || mode_t mode = get_execution_mode(argc, argv);
(strcmp(argv[1], "-tokens") != 0 && strcmp(argv[1], "-text") != 0)) {
puts("Usage: oas -tokens <filename>");
puts("Usage: oas -text <filename>");
return 1;
}
bool (*print_fn)(lexer_token_t *);
char *filename = argv[2]; char *filename = argv[2];
if (strcmp(argv[1], "-tokens") == 0) {
print_fn = print_token;
} else {
print_fn = print_value;
}
lexer_t *lex = &(lexer_t){}; lexer_t *lex = &(lexer_t){};
error_t *err = lexer_open(lex, filename); error_t *err = lexer_open(lex, filename);
@ -54,9 +81,18 @@ int main(int argc, char *argv[]) {
if (err) if (err)
goto cleanup_tokens; goto cleanup_tokens;
for (auto entry = list->head; entry; entry = entry->next) { switch (mode) {
print_fn(&entry->token); case MODE_TOKENS:
print_tokens(list);
break;
case MODE_TEXT:
print_text(list);
break;
case MODE_AST:
print_ast(list);
break;
} }
tokenlist_free(list); tokenlist_free(list);
error_free(err); error_free(err);
return 0; return 0;

53
src/parser.c Normal file
View File

@ -0,0 +1,53 @@
#include "parser.h"
#include "ast.h"
#include "lexer.h"
#include "parser_combinators.h"
#include "parser_primitives.h"
#include "parser_util.h"
#include "tokenlist.h"
parse_result_t parse_number(tokenlist_entry_t *current) {
parser_t parsers[] = {parse_octal, parse_decimal, parse_hexadecimal,
parse_binary, nullptr};
return parse_any(current, parsers);
}
parse_result_t parse_operand(tokenlist_entry_t *current) {
// FIXME: not the correct set of parsers
parser_t parsers[] = {parse_register, parse_number, nullptr};
return parse_any(current, parsers);
}
parse_result_t parse_operands(tokenlist_entry_t *current) {
return parse_list(current, NODE_OPERANDS, true, TOKEN_COMMA, parse_operand);
}
parse_result_t parse_label(tokenlist_entry_t *current) {
parser_t parsers[] = {parse_identifier, parse_colon, nullptr};
return parse_consecutive(current, NODE_LABEL, parsers);
}
parse_result_t parse_section_directive(tokenlist_entry_t *current) {
parser_t parsers[] = {parse_section, parse_identifier, nullptr};
return parse_consecutive(current, NODE_SECTION_DIRECTIVE, parsers);
}
parse_result_t parse_directive(tokenlist_entry_t *current) {
parser_t parsers[] = {parse_dot, parse_section_directive, nullptr};
return parse_consecutive(current, NODE_DIRECTIVE, parsers);
}
parse_result_t parse_instruction(tokenlist_entry_t *current) {
parser_t parsers[] = {parse_identifier, parse_operands, nullptr};
return parse_consecutive(current, NODE_INSTRUCTION, parsers);
}
parse_result_t parse_statement(tokenlist_entry_t *current) {
parser_t parsers[] = {parse_label, parse_directive, parse_instruction,
nullptr};
return parse_any(current, parsers);
}
parse_result_t parse(tokenlist_entry_t *current) {
return parse_many(current, NODE_PROGRAM, true, parse_statement);
}

11
src/parser.h Normal file
View File

@ -0,0 +1,11 @@
#ifndef INCLUDE_SRC_PARSER_H_
#define INCLUDE_SRC_PARSER_H_
#include "ast.h"
#include "error.h"
#include "parser_util.h"
#include "tokenlist.h"
parse_result_t parse(tokenlist_entry_t *current);
#endif // INCLUDE_SRC_PARSER_H_

126
src/parser_combinators.c Normal file
View File

@ -0,0 +1,126 @@
#include "parser_combinators.h"
// Parse a list of the given parser delimited by the given token id. Does not
// store the delimiters in the parent node
parse_result_t parse_list(tokenlist_entry_t *current, node_id_t id,
bool allow_none, lexer_token_id_t delimiter_id,
parser_t parser) {
ast_node_t *many;
error_t *err = ast_node_alloc(&many);
parse_result_t result;
if (err)
return parse_error(err);
many->id = id;
while (current) {
// Skip beyond the delimiter on all but the first iteration
if (many->len > 0) {
if (current->token.id != delimiter_id)
break;
current = tokenlist_next(current);
if (current == nullptr) {
// FIXME: this isn't quite right, we can't consume the delimiter
// if the next element will fail to parse but it's late and I
// must think this through tomorrow
break;
}
}
result = parser(current);
if (result.err == err_parse_no_match)
break;
if (result.err) {
ast_node_free(many);
return result;
}
err = ast_node_add_child(many, result.node);
if (err) {
ast_node_free(many);
ast_node_free(result.node);
return parse_error(err);
}
current = result.next;
}
if (!allow_none && many->len == 0) {
ast_node_free(many);
return parse_no_match();
}
return parse_success(many, current);
}
parse_result_t parse_any(tokenlist_entry_t *current, parser_t parsers[]) {
parser_t parser;
while ((parser = *parsers++)) {
parse_result_t result = parser(current);
if (result.err == nullptr)
return result;
}
return parse_no_match();
}
// parse as many of the giver parsers objects in a row as possible,
// potentially allowing none wraps the found objects in a new ast node with
// the given note id
parse_result_t parse_many(tokenlist_entry_t *current, node_id_t id,
bool allow_none, parser_t parser) {
ast_node_t *many;
error_t *err = ast_node_alloc(&many);
parse_result_t result;
if (err)
return parse_error(err);
many->id = id;
while (current) {
result = parser(current);
if (result.err == err_parse_no_match)
break;
if (result.err) {
ast_node_free(many);
return result;
}
err = ast_node_add_child(many, result.node);
if (err) {
ast_node_free(many);
ast_node_free(result.node);
return parse_error(err);
}
current = result.next;
}
if (!allow_none && many->len == 0) {
ast_node_free(many);
return parse_no_match();
}
return parse_success(many, current);
}
// Parse all tries to parse all parsers consecutively and if it succeeds it
// wraps the parsed nodes in a new parent node.
parse_result_t parse_consecutive(tokenlist_entry_t *current, node_id_t id,
parser_t parsers[]) {
ast_node_t *all;
error_t *err = ast_node_alloc(&all);
parse_result_t result;
if (err)
return parse_error(err);
all->id = id;
parser_t parser;
while ((parser = *parsers++) && current) {
result = parser(current);
if (result.err) {
ast_node_free(all);
return result;
}
err = ast_node_add_child(all, result.node);
if (err) {
ast_node_free(result.node);
ast_node_free(all);
return parse_error(err);
}
current = result.next;
}
return parse_success(all, current);
}

20
src/parser_combinators.h Normal file
View File

@ -0,0 +1,20 @@
#include "parser_util.h"
typedef parse_result_t (*parser_t)(tokenlist_entry_t *);
parse_result_t parse_any(tokenlist_entry_t *current, parser_t parsers[]);
// parse as many of the giver parsers objects in a row as possible, potentially
// allowing none wraps the found objects in a new ast node with the given note
// id
parse_result_t parse_many(tokenlist_entry_t *current, node_id_t id,
bool allow_none, parser_t parser);
parse_result_t parse_list(tokenlist_entry_t *current, node_id_t id,
bool allow_none, lexer_token_id_t delimiter_id,
parser_t parser);
// Parse all tries to parse all parsers consecutively and if it succeeds it
// wraps the parsed nodes in a new parent node.
parse_result_t parse_consecutive(tokenlist_entry_t *current, node_id_t id,
parser_t parsers[]);

97
src/parser_primitives.c Normal file
View File

@ -0,0 +1,97 @@
#include "parser_primitives.h"
#include "ast.h"
#include <string.h>
parse_result_t parse_identifier(tokenlist_entry_t *current) {
return parse_token(current, TOKEN_IDENTIFIER, NODE_IDENTIFIER, nullptr);
}
parse_result_t parse_decimal(tokenlist_entry_t *current) {
return parse_token(current, TOKEN_DECIMAL, NODE_DECIMAL, nullptr);
}
parse_result_t parse_hexadecimal(tokenlist_entry_t *current) {
return parse_token(current, TOKEN_HEXADECIMAL, NODE_HEXADECIMAL, nullptr);
}
parse_result_t parse_binary(tokenlist_entry_t *current) {
return parse_token(current, TOKEN_BINARY, NODE_BINARY, nullptr);
}
parse_result_t parse_octal(tokenlist_entry_t *current) {
return parse_token(current, TOKEN_OCTAL, NODE_OCTAL, nullptr);
}
parse_result_t parse_string(tokenlist_entry_t *current) {
return parse_token(current, TOKEN_STRING, NODE_STRING, nullptr);
}
parse_result_t parse_char(tokenlist_entry_t *current) {
return parse_token(current, TOKEN_CHAR, NODE_CHAR, nullptr);
}
parse_result_t parse_colon(tokenlist_entry_t *current) {
return parse_token(current, TOKEN_COLON, NODE_COLON, nullptr);
}
parse_result_t parse_comma(tokenlist_entry_t *current) {
return parse_token(current, TOKEN_COMMA, NODE_COMMA, nullptr);
}
parse_result_t parse_lbracket(tokenlist_entry_t *current) {
return parse_token(current, TOKEN_LBRACKET, NODE_LBRACKET, nullptr);
}
parse_result_t parse_rbracket(tokenlist_entry_t *current) {
return parse_token(current, TOKEN_RBRACKET, NODE_RBRACKET, nullptr);
}
parse_result_t parse_plus(tokenlist_entry_t *current) {
return parse_token(current, TOKEN_PLUS, NODE_PLUS, nullptr);
}
parse_result_t parse_minus(tokenlist_entry_t *current) {
return parse_token(current, TOKEN_MINUS, NODE_MINUS, nullptr);
}
parse_result_t parse_asterisk(tokenlist_entry_t *current) {
return parse_token(current, TOKEN_ASTERISK, NODE_ASTERISK, nullptr);
}
parse_result_t parse_dot(tokenlist_entry_t *current) {
return parse_token(current, TOKEN_DOT, NODE_DOT, nullptr);
}
const char *registers[] = {
// 64-bit registers
"rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi", "r8", "r9", "r10",
"r11", "r12", "r13", "r14", "r15",
// 32-bit registers
"eax", "ecx", "edx", "ebx", "esp", "ebp", "esi", "edi", "r8d", "r9d",
"r10d", "r11d", "r12d", "r13d", "r14d", "r15d",
// 16-bit registers
"ax", "cx", "dx", "bx", "sp", "bp", "si", "di", "r8w", "r9w", "r10w",
"r11w", "r12w", "r13w", "r14w", "r15w",
// 8-bit low registers
"al", "cl", "dl", "bl", "spl", "bpl", "sil", "dil", "r8b", "r9b", "r10b",
"r11b", "r12b", "r13b", "r14b", "r15b", nullptr};
bool is_register_token(lexer_token_t *token) {
for (size_t i = 0; registers[i] != nullptr; ++i)
if (strcmp(token->value, registers[i]) == 0)
return true;
return false;
}
parse_result_t parse_register(tokenlist_entry_t *current) {
return parse_token(current, TOKEN_IDENTIFIER, NODE_REGISTER,
is_register_token);
}
bool is_section_token(lexer_token_t *token) {
return strcmp(token->value, "section") == 0;
}
parse_result_t parse_section(tokenlist_entry_t *current) {
return parse_token(current, TOKEN_IDENTIFIER, NODE_SECTION,
is_section_token);
}

29
src/parser_primitives.h Normal file
View File

@ -0,0 +1,29 @@
#ifndef INCLUDE_SRC_PARSER_PRIMITIVES_H_
#define INCLUDE_SRC_PARSER_PRIMITIVES_H_
#include "parser_util.h"
parse_result_t parse_identifier(tokenlist_entry_t *current);
parse_result_t parse_decimal(tokenlist_entry_t *current);
parse_result_t parse_hexadecimal(tokenlist_entry_t *current);
parse_result_t parse_binary(tokenlist_entry_t *current);
parse_result_t parse_octal(tokenlist_entry_t *current);
parse_result_t parse_string(tokenlist_entry_t *current);
parse_result_t parse_char(tokenlist_entry_t *current);
parse_result_t parse_colon(tokenlist_entry_t *current);
parse_result_t parse_comma(tokenlist_entry_t *current);
parse_result_t parse_lbracket(tokenlist_entry_t *current);
parse_result_t parse_rbracket(tokenlist_entry_t *current);
parse_result_t parse_plus(tokenlist_entry_t *current);
parse_result_t parse_minus(tokenlist_entry_t *current);
parse_result_t parse_asterisk(tokenlist_entry_t *current);
parse_result_t parse_dot(tokenlist_entry_t *current);
/* These are "primitives" with a different name and some extra validation on top
* for example, register is just an identifier but it only matches a limited set
* of values
*/
parse_result_t parse_register(tokenlist_entry_t *current);
parse_result_t parse_section(tokenlist_entry_t *current);
#endif // INCLUDE_SRC_PARSER_PRIMITIVES_H_

35
src/parser_util.c Normal file
View File

@ -0,0 +1,35 @@
#include "parser_util.h"
#include "tokenlist.h"
error_t *err_parse_no_match =
&(error_t){.message = "parsing failed to find the correct token sequence"};
parse_result_t parse_error(error_t *err) {
return (parse_result_t){.err = err};
}
parse_result_t parse_no_match() {
return parse_error(err_parse_no_match);
}
parse_result_t parse_success(ast_node_t *ast, tokenlist_entry_t *next) {
next = tokenlist_skip_trivia(next);
return (parse_result_t){.node = ast, .next = next};
}
parse_result_t parse_token(tokenlist_entry_t *current,
lexer_token_id_t token_id, node_id_t ast_id,
token_validator_t is_valid) {
if (current->token.id != token_id ||
(is_valid && !is_valid(&current->token)))
return parse_no_match();
ast_node_t *node;
error_t *err = ast_node_alloc(&node);
if (err)
return parse_error(err);
node->id = ast_id;
node->token_entry = current;
return parse_success(node, current->next);
}

27
src/parser_util.h Normal file
View File

@ -0,0 +1,27 @@
#ifndef INCLUDE_SRC_PARSER_UTIL_H_
#define INCLUDE_SRC_PARSER_UTIL_H_
#include "ast.h"
#include "error.h"
#include "tokenlist.h"
typedef struct parse_result {
error_t *err;
tokenlist_entry_t *next;
ast_node_t *node;
} parse_result_t;
typedef bool (*token_validator_t)(lexer_token_t *);
parse_result_t parse_error(error_t *err);
parse_result_t parse_no_match();
parse_result_t parse_success(ast_node_t *ast, tokenlist_entry_t *next);
parse_result_t parse_token(tokenlist_entry_t *current,
lexer_token_id_t token_id, node_id_t ast_id,
token_validator_t is_valid);
tokenlist_entry_t *skip_insignificant(tokenlist_entry_t *);
extern error_t *err_parse_no_match;
#endif // INCLUDE_SRC_PARSER_UTIL_H_

View File

@ -81,3 +81,26 @@ error_t *tokenlist_fill(tokenlist_t *list, lexer_t *lex) {
return err; return err;
return nullptr; return nullptr;
} }
bool is_trivia(tokenlist_entry_t *trivia) {
switch (trivia->token.id) {
case TOKEN_WHITESPACE:
case TOKEN_COMMENT:
case TOKEN_NEWLINE:
return true;
default:
return false;
}
}
tokenlist_entry_t *tokenlist_skip_trivia(tokenlist_entry_t *current) {
while (current && is_trivia(current))
current = current->next;
return current;
}
tokenlist_entry_t *tokenlist_next(tokenlist_entry_t *current) {
if (!current)
return nullptr;
return tokenlist_skip_trivia(current->next);
}

View File

@ -27,4 +27,14 @@ error_t *tokenlist_fill(tokenlist_t *list, lexer_t *lex);
void tokenlist_free(tokenlist_t *list); void tokenlist_free(tokenlist_t *list);
/**
* Return the first token entry that isn't whitespace, newline or comment
*/
tokenlist_entry_t *tokenlist_skip_trivia(tokenlist_entry_t *current);
/**
* Return the next token entry that isn't whitespace, newline or comment
*/
tokenlist_entry_t *tokenlist_next(tokenlist_entry_t *current);
#endif // INCLUDE_SRC_TOKENLIST_H_ #endif // INCLUDE_SRC_TOKENLIST_H_

View File

@ -1,4 +1,7 @@
.section text
_start: _start:
mov eax, ebx
mov eax, 555 ; move 555 into eax mov eax, 555 ; move 555 into eax
push 0o777 push 0o777
xor eax, 0xDEADBEEF xor eax, 0xDEADBEEF

View File

@ -10,7 +10,7 @@ scan-build -o reports/static-analysis/ -plist-html --status-bugs make all
# Run the sanitizer builds and valgrind # Run the sanitizer builds and valgrind
make clean sanitize all make clean sanitize all
ARGUMENTS=("-tokens" "-text") ARGUMENTS=("tokens" "text" "ast")
while IFS= read -r INPUT_FILE; do while IFS= read -r INPUT_FILE; do
for ARGS in ${ARGUMENTS[@]}; do for ARGS in ${ARGUMENTS[@]}; do
./oas-asan $ARGS $INPUT_FILE > /dev/null ./oas-asan $ARGS $INPUT_FILE > /dev/null