implement parsing #3

Merged
omicron merged 15 commits from parser_start into main 2025-04-02 19:53:48 +00:00
20 changed files with 1117 additions and 49 deletions

View File

@ -10,7 +10,7 @@ OBJECTS = $(SOURCES:.c=.o)
DEPENDENCIES = $(SOURCES:.c=.d) DEPENDENCIES = $(SOURCES:.c=.d)
TARGET?=oas TARGET?=oas
OUTPUTS=oas oas-asan oas-msan oas-afl OUTPUTS=oas oas-asan oas-msan oas-afl
RUNARGUMENTS?=-tokens tests/input/valid.asm RUNARGUMENTS?=ast tests/input/valid.asm
all: $(TARGET) all: $(TARGET)

39
doc/parser_grammar.txt Normal file
View File

@ -0,0 +1,39 @@
<program> ::= <statement>*
<statement> ::= <label> | <directive> | <instruction>
<label> ::= <identifier> <colon>
<directive> ::= <dot> <section_directive>
<section_directive> ::= <section> <identifier>
<instruction> ::= <identifier> <operands>
<operands> ::= <operand> ( <comma> <operand> )*
<operand> ::= <register> | <immediate> | <memory>
<immediate> ::= <number> | <label_reference>
<number> ::= <octal> | <binary> | <decimal> | <hexadecimal>
<label_reference> ::= <identifier>
<memory> ::= <lbracket> <memory_expression> <rbracket>
<memory_expression> ::= <label_reference> | <register_expression>
<register_expression> ::= <register> <register_index>? <register_offset>?
<register_index> ::= <plus> <register> <asterisk> <number>
<register_offset> ::= <plus_or_minus> <number>
<plus_or_minus> ::= <plus> | <minus>
/* These are lexer identifiers with the correct string value */
<section> ::= "section"
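<register> ::= "rax" | "rbx" | "rcx" | "rdx" | "rsi" | "rdi" | "rbp" | "rsp" |
"r8" | "r9" | "r10" | "r11" | "r12" | "r13" | "r14" | "r15" |
/* 32-bit */ "eax" | "ebx" | "ecx" | "edx" | "esi" | "edi" | "ebp" | "esp" | "r8d" ... "r15d" |
/* 16-bit */ "ax" | "bx" | "cx" | "dx" | "si" | "di" | "bp" | "sp" | "r8w" ... "r15w" |
/* 8-bit low */ "al" | "bl" | "cl" | "dl" | "sil" | "dil" | "bpl" | "spl" | "r8b" ... "r15b"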

187
src/ast.c Normal file
View File

@ -0,0 +1,187 @@
#include "ast.h"
#include "error.h"
#include <assert.h>
#include <string.h>
/* Static error returned by ast_node_grow_cap() when a node already holds
 * node_max_children_cap child slots and cannot grow any further. */
error_t *err_node_children_cap = &(error_t){
.message = "Failed to increase ast node children, max capacity reached"};
/**
 * Allocate a zero-initialised AST node.
 *
 * @param[out] output receives the new node; set to nullptr when the
 *                    allocation fails
 * @return nullptr on success, err_allocation_failed otherwise
 */
error_t *ast_node_alloc(ast_node_t **output) {
    ast_node_t *fresh = calloc(1, sizeof *fresh);
    *output = fresh;
    return fresh != nullptr ? nullptr : err_allocation_failed;
}
/**
 * Release whatever the node's value union owns.
 *
 * Currently a no-op: value ownership has not been decided yet.
 */
void ast_node_free_value(ast_node_t *node) {
    // TODO: decide how value ownership will work and clean it up here
    (void)node;
}
/**
 * Recursively free a node, its children array and every child in it.
 * Passing nullptr is allowed and does nothing.
 */
void ast_node_free(ast_node_t *node) {
    if (!node)
        return;

    ast_node_t **kids = node->children;
    if (kids != nullptr) {
        size_t index = 0;
        while (index < node->len) {
            ast_node_free(kids[index]);
            index += 1;
        }
        free(kids);
    }

    ast_node_free_value(node);
    // Scrub the node before releasing it so stale pointers are not left behind
    memset(node, 0, sizeof *node);
    free(node);
}
/**
 * Allocate the initial children array (node_default_children_cap slots).
 *
 * @pre node->children must be nullptr
 * @return nullptr on success, err_allocation_failed otherwise
 */
error_t *ast_node_alloc_children(ast_node_t *node) {
    ast_node_t **slots = calloc(node_default_children_cap, sizeof *slots);
    node->children = slots;
    if (!slots)
        return err_allocation_failed;

    node->cap = node_default_children_cap;
    return nullptr;
}
/**
 * Double the capacity of the children array, clamped to
 * node_max_children_cap.
 *
 * @return nullptr on success, err_node_children_cap when the node is already
 *         at maximum capacity, err_allocation_failed when realloc fails (the
 *         existing array is left untouched in that case)
 */
error_t *ast_node_grow_cap(ast_node_t *node) {
    const size_t old_cap = node->cap;
    if (old_cap >= node_max_children_cap)
        return err_node_children_cap;

    const size_t doubled = old_cap * 2;
    const size_t target =
        doubled > node_max_children_cap ? node_max_children_cap : doubled;

    ast_node_t **resized = realloc(node->children, target * sizeof *resized);
    if (!resized)
        return err_allocation_failed;

    node->cap = target;
    node->children = resized;
    return nullptr;
}
/**
 * Append child to node's children, allocating or growing the array first
 * when needed. On error the child is not stored and remains owned by the
 * caller.
 */
error_t *ast_node_add_child(ast_node_t *node, ast_node_t *child) {
    if (node->children == nullptr) {
        error_t *alloc_err = ast_node_alloc_children(node);
        if (alloc_err)
            return alloc_err;
    } else if (node->len >= node->cap) {
        error_t *grow_err = ast_node_grow_cap(node);
        if (grow_err)
            return grow_err;
    }

    node->children[node->len++] = child;
    return nullptr;
}
/**
 * Map a node id to its enumerator name as a static string.
 *
 * Ids without an entry in the table trip the assertion below.
 */
const char *ast_node_id_to_cstr(node_id_t id) {
#define NODE_NAME(node) [node] = #node
    static const char *const names[] = {
        NODE_NAME(NODE_INVALID),
        NODE_NAME(NODE_PROGRAM),
        NODE_NAME(NODE_STATEMENT),
        NODE_NAME(NODE_LABEL),
        NODE_NAME(NODE_DIRECTIVE),
        NODE_NAME(NODE_INSTRUCTION),
        NODE_NAME(NODE_OPERANDS),
        NODE_NAME(NODE_OPERAND),
        NODE_NAME(NODE_IMMEDIATE),
        NODE_NAME(NODE_MEMORY),
        NODE_NAME(NODE_NUMBER),
        NODE_NAME(NODE_LABEL_REFERENCE),
        NODE_NAME(NODE_MEMORY_EXPRESSION),
        NODE_NAME(NODE_REGISTER_EXPRESSION),
        NODE_NAME(NODE_REGISTER_INDEX),
        NODE_NAME(NODE_REGISTER_OFFSET),
        NODE_NAME(NODE_PLUS_OR_MINUS),
        NODE_NAME(NODE_SECTION_DIRECTIVE),
        NODE_NAME(NODE_REGISTER),
        NODE_NAME(NODE_SECTION),
        NODE_NAME(NODE_IDENTIFIER),
        NODE_NAME(NODE_DECIMAL),
        NODE_NAME(NODE_HEXADECIMAL),
        NODE_NAME(NODE_OCTAL),
        NODE_NAME(NODE_BINARY),
        NODE_NAME(NODE_CHAR),
        NODE_NAME(NODE_STRING),
        NODE_NAME(NODE_COLON),
        NODE_NAME(NODE_COMMA),
        NODE_NAME(NODE_LBRACKET),
        NODE_NAME(NODE_RBRACKET),
        NODE_NAME(NODE_PLUS),
        NODE_NAME(NODE_MINUS),
        NODE_NAME(NODE_ASTERISK),
        NODE_NAME(NODE_DOT),
    };
#undef NODE_NAME

    const size_t known = sizeof names / sizeof names[0];
    const size_t slot = (size_t)id;
    if (slot < known && names[slot] != nullptr)
        return names[slot];

    assert(!"Unreachable, weird node id" && id);
    __builtin_unreachable();
}
/**
 * Print node and its subtree to stdout, one node per line.
 *
 * Each line is prefixed with `indent` padding strings, followed by the node
 * id name and, when the node carries a token with a value, that value in
 * double quotes. Children are printed with indent + 1.
 */
static void ast_node_print_internal(ast_node_t *node, int indent) {
    if (!node)
        return;

    int pad = indent;
    while (pad-- > 0)
        printf(" ");

    printf("%s", ast_node_id_to_cstr(node->id));

    tokenlist_entry_t *entry = node->token_entry;
    if (entry != nullptr && entry->token.value)
        printf(" \"%s\"", entry->token.value);
    printf("\n");

    const int child_indent = indent + 1;
    for (size_t child = 0; child < node->len; ++child)
        ast_node_print_internal(node->children[child], child_indent);
}
/** Print the tree rooted at node to stdout, starting at indentation 0. */
void ast_node_print(ast_node_t *node) {
    const int root_indent = 0;
    ast_node_print_internal(node, root_indent);
}

120
src/ast.h Normal file
View File

@ -0,0 +1,120 @@
#ifndef INCLUDE_SRC_AST_H_
#define INCLUDE_SRC_AST_H_
#include "error.h"
#include "lexer.h"
#include "tokenlist.h"
#include <stddef.h>
#include <stdint.h>
/**
 * Identifies what an AST node represents.
 *
 * The first group mirrors grammar productions, the second group are
 * identifier tokens that passed an extra validation step, and the last group
 * maps one-to-one onto lexer tokens. NODE_INVALID is 0, which is the id a
 * freshly allocated (zeroed) node has.
 */
typedef enum node_id {
NODE_INVALID,
NODE_PROGRAM,
NODE_STATEMENT,
NODE_LABEL,
NODE_DIRECTIVE,
NODE_INSTRUCTION,
NODE_OPERANDS,
NODE_OPERAND,
NODE_IMMEDIATE,
NODE_MEMORY,
NODE_NUMBER,
NODE_LABEL_REFERENCE,
NODE_MEMORY_EXPRESSION,
NODE_REGISTER_EXPRESSION,
NODE_REGISTER_INDEX,
NODE_REGISTER_OFFSET,
NODE_PLUS_OR_MINUS,
NODE_SECTION_DIRECTIVE,
// Validated primitives
NODE_REGISTER,
NODE_SECTION,
// Primitive nodes
NODE_IDENTIFIER,
NODE_DECIMAL,
NODE_HEXADECIMAL,
NODE_OCTAL,
NODE_BINARY,
NODE_CHAR,
NODE_STRING,
NODE_COLON,
NODE_COMMA,
NODE_LBRACKET,
NODE_RBRACKET,
NODE_PLUS,
NODE_MINUS,
NODE_ASTERISK,
NODE_DOT,
} node_id_t;
typedef struct ast_node ast_node_t;
/* Number of child slots allocated when the first child is added */
constexpr size_t node_default_children_cap = 8;
/* 65K ought to be enough for anybody */
constexpr size_t node_max_children_cap = 1 << 16;
/**
 * A node in the abstract syntax tree.
 *
 * Children are owned by the node and released by ast_node_free().
 */
struct ast_node {
// What this node represents
node_id_t id;
// Token list entry this node was created from; nullptr for nodes that were
// not built directly from a single token
tokenlist_entry_t *token_entry;
// Number of children currently stored
size_t len;
// Number of slots allocated in children
size_t cap;
// Child array, nullptr until the first child is added
ast_node_t **children;
// Decoded value of the node. NOTE(review): nothing visible here populates
// this yet, ownership is still undecided (see ast_node_free_value)
union {
struct {
uint64_t value;
int size;
} integer;
char *name;
} value;
};
/**
 * @brief Allocates a new AST node
 *
 * Creates and initializes a new AST node with default (zero) values.
 *
 * @param[out] output Pointer to store the allocated node; set to nullptr when
 *                    the allocation fails
 * @return error_t* nullptr on success, allocation error on failure
 */
error_t *ast_node_alloc(ast_node_t **output);
/**
* @brief Frees an AST node and all its children recursively
*
* Recursively frees all children of the node, then frees the node itself.
* If node is nullptr, the function returns without doing anything.
*
* @param node The node to free
*/
void ast_node_free(ast_node_t *node);
/**
* @brief Adds a child node to a parent node
*
* Adds the specified child node to the parent's children array.
* If this is the first child, the function allocates the children array.
* If the children array is full, the function increases its capacity.
*
* @param node The parent node to add the child to
* @param child The child node to add
* @return error_t* nullptr on success, allocation error on failure,
* or err_node_children_cap if maximum capacity is reached
*/
error_t *ast_node_add_child(ast_node_t *node, ast_node_t *child);
/**
* @brief Prints an AST starting from the given node
*
* Prints a representation of the AST with indentation to show structure.
* Each node's type is shown, and if a node has an associated token value,
* that value is printed in quotes.
*
* @param node The root node of the AST to print
*/
void ast_node_print(ast_node_t *node);
#endif // INCLUDE_SRC_AST_H_

View File

@ -10,6 +10,9 @@ error_t *const err_errorf_length = &(error_t){
.message = .message =
"Formatting of another error failed to determine the error length"}; "Formatting of another error failed to determine the error length"};
error_t *err_allocation_failed =
&(error_t){.message = "Memory allocation failed"};
error_t *errorf(const char *fmt, ...) { error_t *errorf(const char *fmt, ...) {
error_t *err = calloc(1, sizeof(error_t)); error_t *err = calloc(1, sizeof(error_t));
if (err == nullptr) if (err == nullptr)

View File

@ -18,4 +18,7 @@ static inline void error_free(error_t *err) {
free(err); free(err);
} }
/* Some global errors */
extern error_t *err_allocation_failed;
#endif // INCLUDE_SRC_ERROR_H_ #endif // INCLUDE_SRC_ERROR_H_

View File

@ -20,9 +20,6 @@ error_t *err_eof =
error_t *err_unknown_read = &(error_t){.message = "Unknown read error"}; error_t *err_unknown_read = &(error_t){.message = "Unknown read error"};
error_t *err_allocation_failed =
&(error_t){.message = "Memory allocation failed"};
typedef bool (*char_predicate_t)(char); typedef bool (*char_predicate_t)(char);
const char *lexer_token_id_to_cstr(lexer_token_id_t id) { const char *lexer_token_id_to_cstr(lexer_token_id_t id) {
@ -186,7 +183,7 @@ error_t *lexer_consume_n(lexer_t *lex, const size_t len,
char buffer[static len], const size_t n) { char buffer[static len], const size_t n) {
if (lex->buffer_count < n) if (lex->buffer_count < n)
return err_buffer_underrun; return err_buffer_underrun;
if (len > n) if (n > len)
return err_consume_excessive_length; return err_consume_excessive_length;
memcpy(buffer, lex->buffer, n); memcpy(buffer, lex->buffer, n);

View File

@ -1,62 +1,108 @@
#include "error.h" #include "error.h"
#include "lexer.h" #include "lexer.h"
#include "parser/parser.h"
#include "tokenlist.h"
#include <limits.h> #include <limits.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
bool print_token(lexer_token_t *token) { typedef enum mode { MODE_AST, MODE_TEXT, MODE_TOKENS } mode_t;
lexer_token_print(token);
return true; void print_tokens(tokenlist_t *list) {
for (auto entry = list->head; entry; entry = entry->next) {
auto token = &entry->token;
lexer_token_print(token);
}
} }
bool print_value(lexer_token_t *token) { void print_text(tokenlist_t *list) {
if (token->id == TOKEN_ERROR) { for (auto entry = list->head; entry; entry = entry->next) {
printf("%s\n", token->value); auto token = &entry->token;
for (size_t i = 0; i < token->character_number; ++i) if (token->id == TOKEN_ERROR) {
printf(" "); printf("%s\n", token->value);
printf("^-- %s\n", token->explanation); for (size_t i = 0; i < token->character_number; ++i)
} else { printf(" ");
printf("%s", token->value); printf("^-- %s\n", token->explanation);
return;
} else {
printf("%s", token->value);
}
} }
return token->id != TOKEN_ERROR; }
/**
 * Parse the token list and print the resulting AST to stdout.
 *
 * On a parse error only the error message is printed. When parsing stops
 * before the end of the list, the first unparsed token is printed after the
 * tree.
 */
void print_ast(tokenlist_t *list) {
    const parse_result_t parsed = parse(list->head);
    if (parsed.err != nullptr) {
        puts(parsed.err->message);
        error_free(parsed.err);
        return;
    }

    ast_node_print(parsed.node);
    if (parsed.next) {
        puts("First unparsed token:");
        lexer_token_print(&parsed.next->token);
    }
    ast_node_free(parsed.node);
}
int get_execution_mode(int argc, char *argv[]) {
if (argc != 3 || (strcmp(argv[1], "tokens") != 0 &&
strcmp(argv[1], "text") != 0 && strcmp(argv[1], "ast"))) {
puts("Usage: oas [tokens|text|ast] <filename>");
exit(1);
}
if (strcmp(argv[1], "tokens") == 0)
return MODE_TOKENS;
if (strcmp(argv[1], "text") == 0)
return MODE_TEXT;
return MODE_AST;
} }
int main(int argc, char *argv[]) { int main(int argc, char *argv[]) {
if (argc != 3 || mode_t mode = get_execution_mode(argc, argv);
(strcmp(argv[1], "-tokens") != 0 && strcmp(argv[1], "-text") != 0)) {
puts("Usage: oas -tokens <filename>");
puts("Usage: oas -text <filename>");
return 1;
}
bool (*print_fn)(lexer_token_t *);
char *filename = argv[2]; char *filename = argv[2];
if (strcmp(argv[1], "-tokens") == 0) {
print_fn = print_token; lexer_t *lex = &(lexer_t){};
} else { error_t *err = lexer_open(lex, filename);
print_fn = print_value; if (err)
goto cleanup_error;
tokenlist_t *list;
err = tokenlist_alloc(&list);
if (err)
goto cleanup_lexer;
err = tokenlist_fill(list, lex);
if (err)
goto cleanup_tokens;
switch (mode) {
case MODE_TOKENS:
print_tokens(list);
break;
case MODE_TEXT:
print_text(list);
break;
case MODE_AST:
print_ast(list);
break;
} }
lexer_t lex = {0}; tokenlist_free(list);
lexer_token_t token;
error_t *err = lexer_open(&lex, filename);
if (err) {
puts(err->message);
error_free(err);
return 1;
}
bool keep_going = true;
while (keep_going && (err = lexer_next(&lex, &token)) == nullptr) {
keep_going = print_fn(&token);
free(token.value);
}
if (err && err != err_eof) {
puts(err->message);
}
error_free(err); error_free(err);
return 0; return 0;
cleanup_tokens:
tokenlist_free(list);
cleanup_lexer:
lexer_close(lex);
cleanup_error:
puts(err->message);
error_free(err);
return 1;
} }

126
src/parser/combinators.c Normal file
View File

@ -0,0 +1,126 @@
#include "combinators.h"
// Parse a list of the given parser delimited by the given token id. Does not
// store the delimiters in the parent node.
//
// A delimiter is only consumed when an element is successfully parsed after
// it. A trailing delimiter (followed by nothing, or by something the element
// parser does not match) is left in the token stream, so result.next points at
// that delimiter and the caller can report it.
//
// Returns a node with the given id holding the parsed elements. With
// allow_none == false an empty list yields a no-match result. Any error other
// than err_parse_no_match from the element parser is propagated.
parse_result_t parse_list(tokenlist_entry_t *current, node_id_t id,
                          bool allow_none, lexer_token_id_t delimiter_id,
                          parser_t parser) {
    ast_node_t *many;
    error_t *err = ast_node_alloc(&many);
    if (err)
        return parse_error(err);
    many->id = id;

    while (current) {
        // Where the next element is expected; `current` itself only advances
        // once that element has been parsed successfully
        tokenlist_entry_t *element = current;

        // Look beyond the delimiter on all but the first iteration
        if (many->len > 0) {
            if (current->token.id != delimiter_id)
                break;
            element = tokenlist_next(current);
            if (element == nullptr)
                break;
        }

        parse_result_t result = parser(element);
        if (result.err == err_parse_no_match)
            break;
        if (result.err) {
            ast_node_free(many);
            return result;
        }

        err = ast_node_add_child(many, result.node);
        if (err) {
            ast_node_free(many);
            ast_node_free(result.node);
            return parse_error(err);
        }
        current = result.next;
    }

    if (!allow_none && many->len == 0) {
        ast_node_free(many);
        return parse_no_match();
    }
    return parse_success(many, current);
}
// Try each parser of the nullptr-terminated array in order at the same
// position and return the result of the first one that matches.
//
// Only err_parse_no_match moves on to the next alternative. Any other error
// (e.g. an allocation failure) is returned to the caller immediately instead
// of being masked as a no-match. Returns a no-match result when every
// alternative fails to match.
parse_result_t parse_any(tokenlist_entry_t *current, parser_t parsers[]) {
    for (parser_t *candidate = parsers; *candidate != nullptr; ++candidate) {
        parse_result_t result = (*candidate)(current);
        if (result.err != err_parse_no_match)
            return result;
    }
    return parse_no_match();
}
// Apply the given parser repeatedly for as long as it matches and collect the
// parsed nodes as children of a new node with the given id.
//
// Stops at the first no-match or at the end of the token list. When
// allow_none is false and nothing was parsed, a no-match result is returned.
// Errors other than err_parse_no_match abort and are passed on to the caller.
parse_result_t parse_many(tokenlist_entry_t *current, node_id_t id,
                          bool allow_none, parser_t parser) {
    ast_node_t *collection;
    error_t *err = ast_node_alloc(&collection);
    if (err)
        return parse_error(err);
    collection->id = id;

    while (current != nullptr) {
        parse_result_t step = parser(current);
        if (step.err == err_parse_no_match)
            break;
        if (step.err != nullptr) {
            ast_node_free(collection);
            return step;
        }

        err = ast_node_add_child(collection, step.node);
        if (err != nullptr) {
            ast_node_free(collection);
            ast_node_free(step.node);
            return parse_error(err);
        }
        current = step.next;
    }

    const bool is_empty = collection->len == 0;
    if (is_empty && !allow_none) {
        ast_node_free(collection);
        return parse_no_match();
    }
    return parse_success(collection, current);
}
// Run the parsers of the nullptr-terminated array one after another and wrap
// the parsed nodes in a new parent node with the given id. The first parser
// that fails aborts the sequence and its result is returned.
//
// NOTE: when the token list ends before every parser has run, the remaining
// parsers are skipped and the nodes gathered so far are returned as a success.
parse_result_t parse_consecutive(tokenlist_entry_t *current, node_id_t id,
                                 parser_t parsers[]) {
    ast_node_t *sequence;
    error_t *err = ast_node_alloc(&sequence);
    if (err)
        return parse_error(err);
    sequence->id = id;

    for (size_t step = 0; parsers[step] != nullptr && current != nullptr;
         ++step) {
        parse_result_t part = parsers[step](current);
        if (part.err != nullptr) {
            ast_node_free(sequence);
            return part;
        }

        err = ast_node_add_child(sequence, part.node);
        if (err != nullptr) {
            ast_node_free(part.node);
            ast_node_free(sequence);
            return parse_error(err);
        }
        current = part.next;
    }
    return parse_success(sequence, current);
}

25
src/parser/combinators.h Normal file
View File

@ -0,0 +1,25 @@
#ifndef INCLUDE_PARSER_COMBINATORS_H_
#define INCLUDE_PARSER_COMBINATORS_H_
#include "util.h"
typedef parse_result_t (*parser_t)(tokenlist_entry_t *);
parse_result_t parse_any(tokenlist_entry_t *current, parser_t parsers[]);
// Parse as many of the given parser's objects in a row as possible, potentially
// allowing none. Wraps the found objects in a new ast node with the given node
// id
parse_result_t parse_many(tokenlist_entry_t *current, node_id_t id,
bool allow_none, parser_t parser);
parse_result_t parse_list(tokenlist_entry_t *current, node_id_t id,
bool allow_none, lexer_token_id_t delimiter_id,
parser_t parser);
// Parse all tries to parse all parsers consecutively and if it succeeds it
// wraps the parsed nodes in a new parent node.
parse_result_t parse_consecutive(tokenlist_entry_t *current, node_id_t id,
parser_t parsers[]);
#endif // INCLUDE_PARSER_COMBINATORS_H_

140
src/parser/parser.c Normal file
View File

@ -0,0 +1,140 @@
#include "parser.h"
#include "../ast.h"
#include "../lexer.h"
#include "../tokenlist.h"
#include "combinators.h"
#include "primitives.h"
#include "util.h"
// <number> ::= <octal> | <binary> | <decimal> | <hexadecimal>
// The matched literal node is wrapped in a NODE_NUMBER node.
parse_result_t parse_number(tokenlist_entry_t *current) {
    parser_t alternatives[] = {parse_octal, parse_decimal, parse_hexadecimal,
                               parse_binary, nullptr};
    return parse_result_wrap(NODE_NUMBER, parse_any(current, alternatives));
}
// <plus_or_minus> ::= <plus> | <minus>
// Returns the matched NODE_PLUS or NODE_MINUS node directly, without wrapping.
parse_result_t parse_plus_or_minus(tokenlist_entry_t *current) {
    parser_t alternatives[] = {parse_plus, parse_minus, nullptr};
    parse_result_t sign = parse_any(current, alternatives);
    return sign;
}
// <register_index> ::= <plus> <register> <asterisk> <number>
parse_result_t parse_register_index(tokenlist_entry_t *current) {
    parser_t sequence[] = {
        parse_plus,
        parse_register,
        parse_asterisk,
        parse_number,
        nullptr,
    };
    return parse_consecutive(current, NODE_REGISTER_INDEX, sequence);
}
// <register_offset> ::= <plus_or_minus> <number>
parse_result_t parse_register_offset(tokenlist_entry_t *current) {
    parser_t sequence[] = {
        parse_plus_or_minus,
        parse_number,
        nullptr,
    };
    return parse_consecutive(current, NODE_REGISTER_OFFSET, sequence);
}
parse_result_t parse_register_expression(tokenlist_entry_t *current) {
parse_result_t result;
ast_node_t *expr;
error_t *err = ast_node_alloc(&expr);
if (err)
return parse_error(err);
expr->id = NODE_REGISTER_EXPRESSION;
// <register>
result = parse_register(current);
if (result.err) {
ast_node_free(expr);
return result;
}
err = ast_node_add_child(expr, result.node);
if (err) {
ast_node_free(result.node);
ast_node_free(expr);
return parse_error(err);
}
current = result.next;
// <register_index>?
result = parse_register_index(current);
if (result.err) {
error_free(result.err);
} else {
err = ast_node_add_child(expr, result.node);
if (err) {
ast_node_free(result.node);
ast_node_free(expr);
return parse_error(err);
}
current = result.next;
}
// <register_offset>?
result = parse_register_offset(current);
if (result.err) {
error_free(result.err);
} else {
err = ast_node_add_child(expr, result.node);
if (err) {
ast_node_free(result.node);
ast_node_free(expr);
return parse_error(err);
}
current = result.next;
}
return parse_success(expr, current);
}
// <immediate> ::= <number> | <label_reference>
//
// The matched node is wrapped in a NODE_IMMEDIATE node. An identifier in
// immediate position is a reference to a label, so it is parsed with
// parse_label_reference and shows up as NODE_LABEL_REFERENCE in the tree, as
// the grammar describes, rather than as a bare NODE_IDENTIFIER.
parse_result_t parse_immediate(tokenlist_entry_t *current) {
    parser_t parsers[] = {parse_number, parse_label_reference, nullptr};
    parse_result_t result = parse_any(current, parsers);
    return parse_result_wrap(NODE_IMMEDIATE, result);
}
// <memory_expression> ::= <label_reference> | <register_expression>
//
// The register expression is tried first because register names are lexed as
// identifiers and would otherwise be taken for label references. Any other
// identifier is parsed as a NODE_LABEL_REFERENCE, matching the grammar.
parse_result_t parse_memory_expression(tokenlist_entry_t *current) {
    parser_t parsers[] = {parse_register_expression, parse_label_reference,
                          nullptr};
    return parse_any(current, parsers);
}
// <memory> ::= <lbracket> <memory_expression> <rbracket>
parse_result_t parse_memory(tokenlist_entry_t *current) {
    parser_t sequence[] = {
        parse_lbracket,
        parse_memory_expression,
        parse_rbracket,
        nullptr,
    };
    return parse_consecutive(current, NODE_MEMORY, sequence);
}
// <operand> ::= <register> | <memory> | <immediate>
// Returns the matched alternative's node directly, without wrapping it.
parse_result_t parse_operand(tokenlist_entry_t *current) {
    parser_t alternatives[] = {
        parse_register,
        parse_memory,
        parse_immediate,
        nullptr,
    };
    parse_result_t operand = parse_any(current, alternatives);
    return operand;
}
// Comma separated list of operands collected in a NODE_OPERANDS node. An
// empty operand list is accepted.
parse_result_t parse_operands(tokenlist_entry_t *current) {
    const bool allow_empty = true;
    return parse_list(current, NODE_OPERANDS, allow_empty, TOKEN_COMMA,
                      parse_operand);
}
// <label> ::= <identifier> <colon>
parse_result_t parse_label(tokenlist_entry_t *current) {
    parser_t sequence[] = {
        parse_identifier,
        parse_colon,
        nullptr,
    };
    return parse_consecutive(current, NODE_LABEL, sequence);
}
// <section_directive> ::= "section" <identifier>
parse_result_t parse_section_directive(tokenlist_entry_t *current) {
    parser_t sequence[] = {
        parse_section,
        parse_identifier,
        nullptr,
    };
    return parse_consecutive(current, NODE_SECTION_DIRECTIVE, sequence);
}
// <directive> ::= <dot> <section_directive>
parse_result_t parse_directive(tokenlist_entry_t *current) {
    parser_t sequence[] = {
        parse_dot,
        parse_section_directive,
        nullptr,
    };
    return parse_consecutive(current, NODE_DIRECTIVE, sequence);
}
// <instruction> ::= <identifier> <operands>
parse_result_t parse_instruction(tokenlist_entry_t *current) {
    parser_t sequence[] = {
        parse_identifier,
        parse_operands,
        nullptr,
    };
    return parse_consecutive(current, NODE_INSTRUCTION, sequence);
}
// <statement> ::= <label> | <directive> | <instruction>
// Alternatives are tried in that order; the matched node is returned as is.
parse_result_t parse_statement(tokenlist_entry_t *current) {
    parser_t alternatives[] = {
        parse_label,
        parse_directive,
        parse_instruction,
        nullptr,
    };
    parse_result_t statement = parse_any(current, alternatives);
    return statement;
}
// <program> ::= <statement>*
//
// Entry point of the parser. Returns a NODE_PROGRAM node holding every
// statement that could be parsed; result.next is the first token that was not
// consumed, or nullptr when the whole list was parsed.
//
// Trivia (whitespace, comments, newlines) is only skipped *after* a parsed
// token, so any trivia at the very start of the list has to be skipped here.
// Without that, input that begins with a comment or a blank line would not
// match any statement and would yield an empty program.
parse_result_t parse(tokenlist_entry_t *current) {
    current = tokenlist_skip_trivia(current);
    return parse_many(current, NODE_PROGRAM, true, parse_statement);
}

9
src/parser/parser.h Normal file
View File

@ -0,0 +1,9 @@
#ifndef INCLUDE_PARSER_PARSER_H_
#define INCLUDE_PARSER_PARSER_H_
#include "../tokenlist.h"
#include "util.h"
parse_result_t parse(tokenlist_entry_t *current);
#endif // INCLUDE_PARSER_PARSER_H_

103
src/parser/primitives.c Normal file
View File

@ -0,0 +1,103 @@
#include "primitives.h"
#include "../ast.h"
#include <string.h>
/* Primitive parsers: each one matches exactly one token of the given lexer
 * token id, without further validation, and produces a leaf node with the
 * given node id.
 */
#define DEFINE_TOKEN_PARSER(name, lexer_id, ast_id)                            \
    parse_result_t name(tokenlist_entry_t *current) {                          \
        return parse_token(current, lexer_id, ast_id, nullptr);                \
    }

DEFINE_TOKEN_PARSER(parse_identifier, TOKEN_IDENTIFIER, NODE_IDENTIFIER)
DEFINE_TOKEN_PARSER(parse_decimal, TOKEN_DECIMAL, NODE_DECIMAL)
DEFINE_TOKEN_PARSER(parse_hexadecimal, TOKEN_HEXADECIMAL, NODE_HEXADECIMAL)
DEFINE_TOKEN_PARSER(parse_binary, TOKEN_BINARY, NODE_BINARY)
DEFINE_TOKEN_PARSER(parse_octal, TOKEN_OCTAL, NODE_OCTAL)
DEFINE_TOKEN_PARSER(parse_string, TOKEN_STRING, NODE_STRING)
DEFINE_TOKEN_PARSER(parse_char, TOKEN_CHAR, NODE_CHAR)
DEFINE_TOKEN_PARSER(parse_colon, TOKEN_COLON, NODE_COLON)
DEFINE_TOKEN_PARSER(parse_comma, TOKEN_COMMA, NODE_COMMA)
DEFINE_TOKEN_PARSER(parse_lbracket, TOKEN_LBRACKET, NODE_LBRACKET)
DEFINE_TOKEN_PARSER(parse_rbracket, TOKEN_RBRACKET, NODE_RBRACKET)
DEFINE_TOKEN_PARSER(parse_plus, TOKEN_PLUS, NODE_PLUS)
DEFINE_TOKEN_PARSER(parse_minus, TOKEN_MINUS, NODE_MINUS)
DEFINE_TOKEN_PARSER(parse_asterisk, TOKEN_ASTERISK, NODE_ASTERISK)
DEFINE_TOKEN_PARSER(parse_dot, TOKEN_DOT, NODE_DOT)
/* A label reference is an identifier token given its own node id */
DEFINE_TOKEN_PARSER(parse_label_reference, TOKEN_IDENTIFIER,
                    NODE_LABEL_REFERENCE)

#undef DEFINE_TOKEN_PARSER
/* nullptr-terminated table of the identifier spellings that
 * is_register_token() accepts as register names. Matching is done with
 * strcmp, so only these exact lowercase spellings are recognised.
 */
const char *registers[] = {
// 64-bit registers
"rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi", "r8", "r9", "r10",
"r11", "r12", "r13", "r14", "r15",
// 32-bit registers
"eax", "ecx", "edx", "ebx", "esp", "ebp", "esi", "edi", "r8d", "r9d",
"r10d", "r11d", "r12d", "r13d", "r14d", "r15d",
// 16-bit registers
"ax", "cx", "dx", "bx", "sp", "bp", "si", "di", "r8w", "r9w", "r10w",
"r11w", "r12w", "r13w", "r14w", "r15w",
// 8-bit low registers
"al", "cl", "dl", "bl", "spl", "bpl", "sil", "dil", "r8b", "r9b", "r10b",
"r11b", "r12b", "r13b", "r14b", "r15b", nullptr};
/* Validator: true when the token's value is one of the names in the
 * registers table.
 */
bool is_register_token(lexer_token_t *token) {
    const char **candidate = registers;
    while (*candidate != nullptr) {
        if (strcmp(token->value, *candidate) == 0)
            return true;
        ++candidate;
    }
    return false;
}
/* Matches an identifier token whose value is a known register name and
 * produces a NODE_REGISTER leaf.
 */
parse_result_t parse_register(tokenlist_entry_t *current) {
    token_validator_t validator = is_register_token;
    return parse_token(current, TOKEN_IDENTIFIER, NODE_REGISTER, validator);
}
/* Validator: true when the token's value is exactly "section". */
bool is_section_token(lexer_token_t *token) {
    const int difference = strcmp(token->value, "section");
    return difference == 0;
}
/* Matches an identifier token with the value "section" and produces a
 * NODE_SECTION leaf.
 */
parse_result_t parse_section(tokenlist_entry_t *current) {
    token_validator_t validator = is_section_token;
    return parse_token(current, TOKEN_IDENTIFIER, NODE_SECTION, validator);
}

30
src/parser/primitives.h Normal file
View File

@ -0,0 +1,30 @@
#ifndef INCLUDE_PARSER_PRIMITIVES_H_
#define INCLUDE_PARSER_PRIMITIVES_H_
#include "util.h"
parse_result_t parse_identifier(tokenlist_entry_t *current);
parse_result_t parse_decimal(tokenlist_entry_t *current);
parse_result_t parse_hexadecimal(tokenlist_entry_t *current);
parse_result_t parse_binary(tokenlist_entry_t *current);
parse_result_t parse_octal(tokenlist_entry_t *current);
parse_result_t parse_string(tokenlist_entry_t *current);
parse_result_t parse_char(tokenlist_entry_t *current);
parse_result_t parse_colon(tokenlist_entry_t *current);
parse_result_t parse_comma(tokenlist_entry_t *current);
parse_result_t parse_lbracket(tokenlist_entry_t *current);
parse_result_t parse_rbracket(tokenlist_entry_t *current);
parse_result_t parse_plus(tokenlist_entry_t *current);
parse_result_t parse_minus(tokenlist_entry_t *current);
parse_result_t parse_asterisk(tokenlist_entry_t *current);
parse_result_t parse_dot(tokenlist_entry_t *current);
parse_result_t parse_label_reference(tokenlist_entry_t *current);
/* These are "primitives" with a different name and some extra validation on top
* for example, register is just an identifier but it only matches a limited set
* of values
*/
parse_result_t parse_register(tokenlist_entry_t *current);
parse_result_t parse_section(tokenlist_entry_t *current);
#endif // INCLUDE_PARSER_PRIMITIVES_H_

56
src/parser/util.c Normal file
View File

@ -0,0 +1,56 @@
#include "util.h"
#include "../tokenlist.h"
/* Static sentinel error meaning "this parser does not match at this
 * position". Callers compare against this pointer to tell a failed match
 * apart from other errors.
 */
error_t *err_parse_no_match =
&(error_t){.message = "parsing failed to find the correct token sequence"};
/* Build a failed parse result carrying err; node and next are null. */
parse_result_t parse_error(error_t *err) {
    parse_result_t failure = {.node = nullptr, .next = nullptr};
    failure.err = err;
    return failure;
}
parse_result_t parse_no_match() {
return parse_error(err_parse_no_match);
}
/* Build a successful parse result for ast. The continuation point is moved
 * past any trivia (whitespace, comments, newlines) following `next`.
 */
parse_result_t parse_success(ast_node_t *ast, tokenlist_entry_t *next) {
    parse_result_t success = {
        .node = ast,
        .next = tokenlist_skip_trivia(next),
    };
    return success;
}
/**
 * Match a single token and turn it into a leaf AST node.
 *
 * @param current  token list entry to match; nullptr (end of input) never
 *                 matches
 * @param token_id lexer token id the entry must have
 * @param ast_id   node id given to the created node
 * @param is_valid optional extra check on the token, may be nullptr
 * @return a success result with the new node (its token_entry set to
 *         current) and next pointing past the token and any trailing trivia,
 *         a no-match result when the token does not qualify, or an
 *         allocation error
 */
parse_result_t parse_token(tokenlist_entry_t *current,
                           lexer_token_id_t token_id, node_id_t ast_id,
                           token_validator_t is_valid) {
    // Running out of tokens is a failed match, not a crash
    if (current == nullptr)
        return parse_no_match();

    if (current->token.id != token_id ||
        (is_valid && !is_valid(&current->token)))
        return parse_no_match();

    ast_node_t *node;
    error_t *err = ast_node_alloc(&node);
    if (err)
        return parse_error(err);

    node->id = ast_id;
    node->token_entry = current;
    return parse_success(node, current->next);
}
/**
 * Wrap the node of a successful result in a new parent node with the given
 * id.
 *
 * A failed result is passed through untouched. When wrapping itself fails,
 * both the wrapper and the wrapped node are freed and the error is returned,
 * so the caller never has to clean up.
 */
parse_result_t parse_result_wrap(node_id_t id, parse_result_t result) {
    if (result.err)
        return result;

    ast_node_t *node;
    error_t *err = ast_node_alloc(&node);
    if (err) {
        ast_node_free(result.node);
        return parse_error(err);
    }
    node->id = id;

    err = ast_node_add_child(node, result.node);
    if (err) {
        // The child was not attached, so both nodes have to be released
        // separately; freeing only the child would leak the wrapper
        ast_node_free(result.node);
        ast_node_free(node);
        return parse_error(err);
    }
    return parse_success(node, result.next);
}

26
src/parser/util.h Normal file
View File

@ -0,0 +1,26 @@
#ifndef INCLUDE_PARSER_UTIL_H_
#define INCLUDE_PARSER_UTIL_H_
#include "../ast.h"
#include "../error.h"
#include "../tokenlist.h"
/**
 * Outcome of running a parser.
 *
 * On success err is nullptr, node holds the parsed AST node and next is the
 * token list entry to continue from (nullptr at the end of the list). On
 * failure err is set and node and next are nullptr.
 */
typedef struct parse_result {
error_t *err;
tokenlist_entry_t *next;
ast_node_t *node;
} parse_result_t;
typedef bool (*token_validator_t)(lexer_token_t *);
parse_result_t parse_error(error_t *err);
parse_result_t parse_no_match();
parse_result_t parse_success(ast_node_t *ast, tokenlist_entry_t *next);
parse_result_t parse_token(tokenlist_entry_t *current,
lexer_token_id_t token_id, node_id_t ast_id,
token_validator_t is_valid);
parse_result_t parse_result_wrap(node_id_t id, parse_result_t result);
extern error_t *err_parse_no_match;
#endif // INCLUDE_PARSER_UTIL_H_

106
src/tokenlist.c Normal file
View File

@ -0,0 +1,106 @@
#include "tokenlist.h"
#include "error.h"
#include "lexer.h"
#include <stdlib.h>
/**
 * Allocate an empty token list.
 *
 * @param[out] output receives the list, or nullptr when allocation fails
 * @return nullptr on success, err_allocation_failed otherwise
 */
error_t *tokenlist_alloc(tokenlist_t **output) {
    tokenlist_t *fresh = calloc(1, sizeof *fresh);
    *output = fresh;
    if (!fresh)
        return err_allocation_failed;

    fresh->tail = nullptr;
    fresh->head = nullptr;
    return nullptr;
}
/**
 * Allocate a zeroed, unlinked token list entry.
 *
 * @param[out] output receives the entry, or nullptr when allocation fails
 * @return nullptr on success, err_allocation_failed otherwise
 */
error_t *tokenlist_entry_alloc(tokenlist_entry_t **output) {
    tokenlist_entry_t *fresh = calloc(1, sizeof *fresh);
    *output = fresh;
    if (!fresh)
        return err_allocation_failed;

    fresh->prev = nullptr;
    fresh->next = nullptr;
    return nullptr;
}
/** Link entry in as the new tail of list. */
void tokenlist_append(tokenlist_t *list, tokenlist_entry_t *entry) {
    entry->next = nullptr;

    if (list->head == nullptr) {
        // First entry: it is both head and tail
        entry->prev = nullptr;
        list->head = entry;
    } else {
        entry->prev = list->tail;
        list->tail->next = entry;
    }
    list->tail = entry;
}
/** Clean up the token stored in entry and free the entry itself. */
void tokenlist_entry_free(tokenlist_entry_t *entry) {
    lexer_token_t *owned_token = &entry->token;
    lexer_token_cleanup(owned_token);
    free(entry);
}
/** Free every entry of list and the list itself. nullptr is ignored. */
void tokenlist_free(tokenlist_t *list) {
    if (!list)
        return;

    // Grab the successor before the entry is released
    for (tokenlist_entry_t *entry = list->head, *following; entry != nullptr;
         entry = following) {
        following = entry->next;
        tokenlist_entry_free(entry);
    }
    free(list);
}
/**
 * Read tokens from lex until it reports an error and append each one to
 * list. Reaching err_eof counts as success.
 *
 * @return nullptr once the lexer hits the end of input, otherwise the lexer
 *         or allocation error that stopped the loop
 */
error_t *tokenlist_fill(tokenlist_t *list, lexer_t *lex) {
    lexer_token_t token = {};

    for (;;) {
        error_t *err = lexer_next(lex, &token);
        if (err != nullptr)
            return err == err_eof ? nullptr : err;

        tokenlist_entry_t *entry;
        err = tokenlist_entry_alloc(&entry);
        if (err != nullptr) {
            // The token has no entry to own it, so release it here
            lexer_token_cleanup(&token);
            return err;
        }

        entry->token = token;
        tokenlist_append(list, entry);
    }
}
/** True when the entry holds a whitespace, comment or newline token. */
bool is_trivia(tokenlist_entry_t *trivia) {
    return trivia->token.id == TOKEN_WHITESPACE ||
           trivia->token.id == TOKEN_COMMENT ||
           trivia->token.id == TOKEN_NEWLINE;
}
/**
 * Starting at current (inclusive), return the first entry that is not
 * trivia, or nullptr when the list ends first.
 */
tokenlist_entry_t *tokenlist_skip_trivia(tokenlist_entry_t *current) {
    tokenlist_entry_t *cursor = current;
    for (; cursor != nullptr; cursor = cursor->next) {
        if (!is_trivia(cursor))
            break;
    }
    return cursor;
}
/**
 * Return the first non-trivia entry after current (exclusive), or nullptr
 * when current is nullptr or no such entry exists.
 */
tokenlist_entry_t *tokenlist_next(tokenlist_entry_t *current) {
    return current != nullptr ? tokenlist_skip_trivia(current->next) : nullptr;
}

40
src/tokenlist.h Normal file
View File

@ -0,0 +1,40 @@
#ifndef INCLUDE_SRC_TOKENLIST_H_
#define INCLUDE_SRC_TOKENLIST_H_
#include "lexer.h"
typedef struct tokenlist_entry tokenlist_entry_t;
/**
 * One element of the doubly linked token list. The entry holds its token by
 * value; tokenlist_free() cleans the token up together with the entry.
 */
struct tokenlist_entry {
lexer_token_t token;
// Neighbouring entries, nullptr at either end of the list
tokenlist_entry_t *next;
tokenlist_entry_t *prev;
};
/**
 * Doubly linked list of lexer tokens. head and tail are both nullptr while
 * the list is empty.
 */
typedef struct tokenlist {
tokenlist_entry_t *head;
tokenlist_entry_t *tail;
} tokenlist_t;
/**
* @brief Allocate a new doubly linked list of lexer tokens
*/
error_t *tokenlist_alloc(tokenlist_t **list);
/**
* Consume all tokens from the lexer and add them to the list
*/
error_t *tokenlist_fill(tokenlist_t *list, lexer_t *lex);
void tokenlist_free(tokenlist_t *list);
/**
* Return the first token entry that isn't whitespace, newline or comment
*/
tokenlist_entry_t *tokenlist_skip_trivia(tokenlist_entry_t *current);
/**
* Return the next token entry that isn't whitespace, newline or comment
*/
tokenlist_entry_t *tokenlist_next(tokenlist_entry_t *current);
#endif // INCLUDE_SRC_TOKENLIST_H_

View File

@ -1,5 +1,17 @@
.section text
; Small valid code snippet that should contain all different AST nodes
_start: _start:
mov eax, 555 ; move 555 into eax mov eax, ebx
lea eax, [eax + ebx * 4 + 8]
lea eax, [eax + 8]
lea eax, [eax + ebx * 8]
lea eax, [esp - 24]
lea eax, [eax + ebx * 4 - 8]
lea eax, [_start]
mov eax, _start
mov eax, 555
push 0o777 push 0o777
xor eax, 0xDEADBEEF xor eax, 0xDEADBEEF
and ecx, 0o770 and ecx, 0o770

View File

@ -10,7 +10,7 @@ scan-build -o reports/static-analysis/ -plist-html --status-bugs make all
# Run the sanitizer builds and valgrind # Run the sanitizer builds and valgrind
make clean sanitize all make clean sanitize all
ARGUMENTS=("-tokens" "-text") ARGUMENTS=("tokens" "text" "ast")
while IFS= read -r INPUT_FILE; do while IFS= read -r INPUT_FILE; do
for ARGS in ${ARGUMENTS[@]}; do for ARGS in ${ARGUMENTS[@]}; do
./oas-asan $ARGS $INPUT_FILE > /dev/null ./oas-asan $ARGS $INPUT_FILE > /dev/null