Initial commit, basic lexer structure
This commit is contained in:
commit
df948b18c6
5
.clang-format
Normal file
5
.clang-format
Normal file
@ -0,0 +1,5 @@
|
||||
BasedOnStyle: LLVM
|
||||
IndentWidth: 4
|
||||
Cpp11BracedListStyle: true
|
||||
AlignArrayOfStructures: Left
|
||||
AllowShortFunctionsOnASingleLine: Empty
|
7
.gitignore
vendored
Normal file
7
.gitignore
vendored
Normal file
@ -0,0 +1,7 @@
|
||||
*.o
|
||||
*.d
|
||||
/core
|
||||
/oas
|
||||
/oas-asan
|
||||
/oas-msan
|
||||
/reports
|
19
LICENSE
Normal file
19
LICENSE
Normal file
@ -0,0 +1,19 @@
|
||||
Copyright (c) 2025 omicron <omicron.me@protonmail.com>
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
this software and associated documentation files (the "Software"), to deal in
|
||||
the Software without restriction, including without limitation the rights to
|
||||
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
of the Software, and to permit persons to whom the Software is furnished to do
|
||||
so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
42
Makefile
Normal file
42
Makefile
Normal file
@ -0,0 +1,42 @@
|
||||
# Toolchain and flags. TARGET and LDFLAGS are ?= so the sanitize target can
# override them per-build.
.PHONY: all clean clean-objects run sanitize validate

CC=clang
LD=clang
CFLAGS=-Wall -Wextra -Wpedantic -O0 -g3 -std=c23 -fno-omit-frame-pointer -fno-optimize-sibling-calls
LDFLAGS?=

# Every .c under src/ is built; .d files carry auto-generated header deps.
SOURCES = $(shell find src/ -type f -name '*.c')
OBJECTS = $(SOURCES:.c=.o)
DEPENDENCIES = $(SOURCES:.c=.d)
TARGET?=oas
OUTPUTS=oas oas-asan oas-msan
RUNARGUMENTS=-tokens test.asm

all: $(TARGET)


run: $(TARGET)
	./$(TARGET) $(RUNARGUMENTS)

# Build ASan/UBSan and MSan instrumented binaries, cleaning objects between
# builds so every object is compiled with the matching sanitizer flags.
sanitize:
	make CFLAGS="$(CFLAGS) -fsanitize=address,undefined" LDFLAGS="-fsanitize=address,undefined" TARGET="oas-asan" clean-objects all
	make CFLAGS="$(CFLAGS) -fsanitize=memory -fsanitize-memory-track-origins=2" LDFLAGS="-fsanitize=memory -fsanitize-memory-track-origins=2" TARGET="oas-msan" clean-objects all
	make clean-objects

validate:
	./validate.sh

$(TARGET): $(OBJECTS)
	$(LD) $(LDFLAGS) -o $@ $^

# -MMD -MP writes a .d dependency file next to each object.
%.o: %.c
	$(CC) $(CFLAGS) -MMD -MP -c $< -o $@

-include $(DEPENDENCIES)

clean-objects:
	rm -f $(OBJECTS) $(DEPENDENCIES)

clean: clean-objects
	rm -f $(TARGET) $(OUTPUTS)
	rm -rf reports/
|
46
doc/lexer_grammar.txt
Normal file
46
doc/lexer_grammar.txt
Normal file
@ -0,0 +1,46 @@
|
||||
/* These non-terminals are the actual tokens the lexer emits */
|
||||
<identifier> ::= <identifier_start> <identifier_character>+
|
||||
<decimal> ::= [0-9]+
|
||||
|
||||
<hexadecimal> ::= "0x" <hex_digit>+ <number_suffix>?
|
||||
<binary> ::= "0b" [0-1]+ <number_suffix>?
|
||||
<octal> ::= "0o" [0-7]+ <number_suffix>?
|
||||
<string> ::= "\"" <string_unit>+ "\""
|
||||
<character> ::= "'" <character_unit> "'"
|
||||
<colon> ::= ":"
|
||||
<comma> ::= ","
|
||||
<lbracket> ::= "["
|
||||
<rbracket> ::= "]"
|
||||
<plus> ::= "+"
|
||||
<minus> ::= "-"
|
||||
<asterisk> ::= "*"
|
||||
<dot> ::= "."
|
||||
<comment> ::= ";" <comment_character>*
|
||||
<newline> ::= "\r"? "\n"
|
||||
<whitespace> ::= ( " " | "\t" )+
|
||||
|
||||
/* helper non-terminals to make it easier to define the tokens */
|
||||
<number_suffix> ::= ":" ( "8" | "16" | "32" | "64" )
|
||||
|
||||
<identifier_start> ::= [a-z] | [A-Z] | "_"
|
||||
<identifier_character> ::= [a-z] | [A-Z] | [0-9] | "_"
|
||||
|
||||
<hex_digit> ::= [a-f] | [A-F]
|
||||
|
||||
<string_unit> ::= <string_regular> | <escaped>
|
||||
<character_unit> ::= <character_regular> | <escaped>
|
||||
|
||||
<escaped> ::= "\\" ( <escape_list> | <escape_hex> )
|
||||
<escape_list> ::= "\\" | "n" | "r" | "t" | "0" | "\"" | "'"
|
||||
<escape_hex> ::= "x" <hex_digit> <hex_digit>
|
||||
|
||||
/* alternative definitions to support bnfplayground, use the ones below instead */
|
||||
<comment_character> ::= <shared_regular> | "'" | "\""
|
||||
<string_regular> ::= <shared_regular> | "'"
|
||||
<character_regular> ::= <shared_regular> | "\""
|
||||
<shared_regular> ::= [a-z] | [A-Z] | [0-9] | " " | "+" | "-" | "#" | "\t" | "_" | "$" | "&" | "{" | "}" | "(" | ")" | "|"
|
||||
|
||||
/* actual definition we're implementing */
|
||||
/* <comment_character> ::= [^\r\n] */
|
||||
/* <character_regular> ::= [^\\'] */
|
||||
/* <string_regular> ::= [^\\"] */
|
42
src/error.c
Normal file
42
src/error.c
Normal file
@ -0,0 +1,42 @@
|
||||
#include "error.h"
|
||||
|
||||
#include <stdarg.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// Static fallback errors returned when errorf() itself cannot allocate or
// measure a formatted message. is_heap_allocated stays false (zero
// initialized), so error_free() correctly ignores them.
error_t *const err_errorf_alloc = &(error_t){
    .message = "Allocation failed during formatting of another error"};
error_t *const err_errorf_length = &(error_t){
    .message =
        "Formatting of another error failed to determine the error length"};
|
||||
|
||||
error_t *errorf(const char *fmt, ...) {
|
||||
error_t *err = calloc(1, sizeof(error_t));
|
||||
if (err == nullptr)
|
||||
return err_errorf_alloc;
|
||||
|
||||
va_list args;
|
||||
va_list args_count;
|
||||
va_start(args, fmt);
|
||||
va_copy(args_count, args);
|
||||
|
||||
int size = vsnprintf(nullptr, 0, fmt, args_count) + 1;
|
||||
va_end(args_count);
|
||||
if (size <= 0) {
|
||||
free(err);
|
||||
va_end(args);
|
||||
return err_errorf_length;
|
||||
}
|
||||
|
||||
err->message = malloc(size);
|
||||
if (err->message == nullptr) {
|
||||
free(err);
|
||||
va_end(args);
|
||||
return err_errorf_alloc;
|
||||
}
|
||||
|
||||
vsnprintf(err->message, size, fmt, args);
|
||||
va_end(args);
|
||||
err->is_heap_allocated = true;
|
||||
return err;
|
||||
}
|
21
src/error.h
Normal file
21
src/error.h
Normal file
@ -0,0 +1,21 @@
|
||||
#ifndef INCLUDE_SRC_ERROR_H_
#define INCLUDE_SRC_ERROR_H_

#include <stdlib.h>

// A diagnostic message. Errors are either heap-allocated by errorf() or
// static singletons; is_heap_allocated tells error_free() which case it has.
typedef struct error {
    char *message;          // human-readable, NUL-terminated description
    bool is_heap_allocated; // true only for errors produced by errorf()
} error_t;

// Create a heap-allocated error from a printf-style format string.
// On internal failure a static fallback error is returned instead of nullptr.
error_t *errorf(const char *fmt, ...);

// Release an error. Accepts nullptr and static (non-heap) errors and
// ignores them, so callers may free unconditionally.
static inline void error_free(error_t *err) {
    if (err == nullptr)
        return;
    if (!err->is_heap_allocated)
        return;
    free(err->message);
    free(err);
}

#endif // INCLUDE_SRC_ERROR_H_
|
465
src/lexer.c
Normal file
465
src/lexer.c
Normal file
@ -0,0 +1,465 @@
|
||||
#include "lexer.h"
|
||||
#include "error.h"
|
||||
#include <assert.h>
|
||||
#include <ctype.h>
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
|
||||
// Static error singletons. Callers compare these by pointer identity
// (e.g. `err == err_eof`); none of them may ever be freed.
error_t *err_lexer_already_open = &(error_t){
    .message =
        "Can't open on a lexer object that is already opened. Close it first."};
error_t *err_prefix_too_large =
    &(error_t){.message = "Prefix too large for internal lexer buffer"};
error_t *err_buffer_underrun = &(error_t){
    .message = "Buffer does not contain enough characters for lexer_consume_n"};
error_t *err_consume_excessive_length =
    &(error_t){.message = "Too many valid characters to consume"};

// Not a failure in the usual sense: signals normal end of input.
error_t *err_eof =
    &(error_t){.message = "Can't read from file because EOF is reached"};

error_t *err_unknown_read = &(error_t){.message = "Unknown read error"};

error_t *err_allocation_failed =
    &(error_t){.message = "Memory allocation failed"};

// Predicate deciding whether a character belongs to the token being lexed.
typedef bool (*char_predicate_t)(char);
|
||||
|
||||
// Map a token id to its symbolic name for diagnostics.
const char *lexer_token_id_to_cstr(lexer_token_id_t id) {
    // Designated-initializer table; keep in sync with lexer_token_id_t.
    static const char *const names[] = {
        [TOKEN_ERROR] = "TOKEN_ERROR",
        [TOKEN_IDENTIFIER] = "TOKEN_IDENTIFIER",
        [TOKEN_DECIMAL] = "TOKEN_DECIMAL",
        [TOKEN_HEXADECIMAL] = "TOKEN_HEXADECIMAL",
        [TOKEN_OCTAL] = "TOKEN_OCTAL",
        [TOKEN_BINARY] = "TOKEN_BINARY",
        [TOKEN_CHAR] = "TOKEN_CHAR",
        [TOKEN_STRING] = "TOKEN_STRING",
        [TOKEN_COLON] = "TOKEN_COLON",
        [TOKEN_COMMA] = "TOKEN_COMMA",
        [TOKEN_LBRACKET] = "TOKEN_LBRACKET",
        [TOKEN_RBRACKET] = "TOKEN_RBRACKET",
        [TOKEN_PLUS] = "TOKEN_PLUS",
        [TOKEN_MINUS] = "TOKEN_MINUS",
        [TOKEN_ASTERISK] = "TOKEN_ASTERISK",
        [TOKEN_DOT] = "TOKEN_DOT",
        [TOKEN_COMMENT] = "TOKEN_COMMENT",
        [TOKEN_NEWLINE] = "TOKEN_NEWLINE",
        [TOKEN_WHITESPACE] = "TOKEN_WHITESPACE",
    };
    const size_t count = sizeof(names) / sizeof(names[0]);
    if ((size_t)id < count)
        return names[id];
    assert(!"Unreachable, weird token id" && id);
    __builtin_unreachable();
}
|
||||
|
||||
// Print a one-line summary of a token; error tokens get a second line
// carrying their explanation.
void lexer_token_print(lexer_token_t *token) {
    const char *name = lexer_token_id_to_cstr(token->id);
    const char *sep = token->value ? ": " : "";
    const char *value = token->value ? token->value : "";
    printf("(%zu, %zu) %s[%d]%s%s\n", token->line_number,
           token->character_number, name, token->id, sep, value);
    if (token->id == TOKEN_ERROR)
        printf(" `--> %s\n", token->explanation);
}
|
||||
|
||||
// Free the token's owned value string and reset all fields to zero so the
// token can be reused safely.
void lexer_token_cleanup(lexer_token_t *token) {
    free(token->value);
    memset(token, 0, sizeof(*token));
}
|
||||
|
||||
// Close the underlying file (if any) and zero the lexer so it can be
// reopened. The nullptr guard matters: fclose(NULL) is undefined behavior,
// and lexer_close() may be called on a lexer that was never opened or
// whose lexer_open() failed.
void lexer_close(lexer_t *lex) {
    if (lex->fp != nullptr)
        fclose(lex->fp);
    memset(lex, 0, sizeof(lexer_t));
}
|
||||
|
||||
// Top up the internal read-ahead buffer from the file until it is full or
// EOF. Returns err_eof only when EOF is reached AND the buffer is already
// empty, so callers can still drain buffered characters after the file ends.
error_t *lexer_fill_buffer(lexer_t *lex) {
    if (feof(lex->fp) && lex->buffer_count == 0)
        return err_eof;
    if (feof(lex->fp))
        return nullptr;
    if (lex->buffer_count == lexer_buffer_size)
        return nullptr;

    size_t remaining = lexer_buffer_size - lex->buffer_count;
    while (remaining > 0) {
        char *buffer = lex->buffer + lex->buffer_count;
        size_t n = fread(buffer, 1, remaining, lex->fp);
        // Partial fill at EOF is fine; a later call reports err_eof once
        // the buffered characters are consumed.
        if (n == 0 && feof(lex->fp))
            break;
        if (n == 0 && ferror(lex->fp))
            return errorf("Read error: %s", strerror(errno));
        if (n == 0)
            return err_unknown_read;
        remaining -= n;
        lex->buffer_count += n;
    }
    return nullptr;
}
|
||||
|
||||
// Attach the lexer to a file opened in binary mode. Fails when the lexer is
// already open or the file cannot be opened. Line/column counters are
// 0-based and reset here.
error_t *lexer_open(lexer_t *lex, char *path) {
    if (lex->fp != nullptr)
        return err_lexer_already_open;

    FILE *fp = fopen(path, "rb");
    if (fp == nullptr)
        return errorf("Failed to open file '%s': %s", path, strerror(errno));

    lex->fp = fp;
    lex->line_number = 0;
    lex->character_number = 0;
    lex->buffer_count = 0;
    return nullptr;
}
|
||||
|
||||
// Drop the first n characters from the read-ahead buffer, sliding the
// remainder to the front. The caller guarantees n <= buffer_count.
void lexer_shift_buffer(lexer_t *lex, int n) {
    size_t remaining = lex->buffer_count - (size_t)n;
    memmove(lex->buffer, lex->buffer + n, remaining);
    lex->buffer_count = remaining;
}
|
||||
|
||||
// Return the next character WITHOUT consuming it. The previous
// implementation shifted the buffer after reading, which consumed the
// character and made "peek" behave like "read next"; a peek must leave the
// stream untouched so a subsequent read sees the same character.
error_t *lexer_peek(lexer_t *lex, char *c) {
    error_t *err = lexer_fill_buffer(lex);
    if (err)
        return err;
    if (lex->buffer_count == 0)
        return err_eof;
    *c = lex->buffer[0];
    return nullptr;
}
|
||||
|
||||
// This does _not_ fill the internal lexer buffer and you _must_ call
|
||||
// lexer_fill_buffer() before calling this. It will always return false if your
|
||||
// prefix is larger than lexer_buffer_size
|
||||
bool lexer_has_prefix(lexer_t *lex, char *prefix) {
|
||||
size_t len = strlen(prefix);
|
||||
if (len > lex->buffer_count)
|
||||
return false;
|
||||
return memcmp(lex->buffer, prefix, len) == 0;
|
||||
}
|
||||
|
||||
// Placeholder used for token kinds that are not lexed yet; always returns
// a formatted error describing the offending character and its position.
error_t *lexer_not_implemented(lexer_t *lex, lexer_token_t *token) {
    (void)token;
    return errorf("Not implemented, character %02x (%c) at (%zu, %zu).\n",
                  lex->buffer[0], lex->buffer[0], lex->line_number,
                  lex->character_number);
}
|
||||
|
||||
// Consume exactly n characters from the lexer into `buffer` (capacity len).
// Fails when fewer than n characters are buffered, or when n exceeds the
// destination capacity. The original test was inverted (`len > n`), which
// rejected nearly every valid call — any destination larger than the
// request, such as the number-suffix consumption in lexer_next_number,
// spuriously returned err_consume_excessive_length.
error_t *lexer_consume_n(lexer_t *lex, const size_t len,
                         char buffer[static len], const size_t n) {
    if (lex->buffer_count < n)
        return err_buffer_underrun;
    if (n > len)
        return err_consume_excessive_length;

    memcpy(buffer, lex->buffer, n);
    lexer_shift_buffer(lex, n);
    return nullptr;
}
|
||||
// Consume characters satisfying is_valid into `buffer` (capacity n),
// refilling the lexer's read-ahead buffer as the run crosses its edge.
// *n_consumed receives the number of characters copied. EOF ends the run
// normally (not an error); err_consume_excessive_length is returned when
// the destination is full while more valid input remains.
error_t *lexer_consume(lexer_t *lex, const size_t n, char buffer[static n],
                       char_predicate_t is_valid, size_t *n_consumed) {
    const size_t buffer_size = n;
    bool have_more_characters = false;
    *n_consumed = 0;
    do {
        // Copy the longest valid run currently in the read-ahead buffer.
        size_t i = 0;
        while (i < lex->buffer_count && i < buffer_size - *n_consumed &&
               is_valid(lex->buffer[i])) {
            ++i;
        }
        memcpy(buffer + *n_consumed, lex->buffer, i);
        lexer_shift_buffer(lex, i);
        *n_consumed += i;

        // Refill to see whether the run continues past the buffer edge.
        error_t *err = lexer_fill_buffer(lex);
        if (err == err_eof)
            have_more_characters = false;
        else if (err)
            return err;
        else
            have_more_characters =
                (lex->buffer_count > 0 && is_valid(lex->buffer[0]));

        // Destination full but valid input still pending: overflow.
        if (have_more_characters && *n_consumed == buffer_size) {
            return err_consume_excessive_length;
        }
    } while (have_more_characters);
    return nullptr;
}
|
||||
|
||||
// True for [0-9a-fA-F]. The unsigned char cast avoids undefined behavior:
// <ctype.h> functions require an argument representable as unsigned char
// or EOF, and a plain char may be negative (CERT STR37-C).
bool is_hexadecimal_character(char c) {
    unsigned char uc = (unsigned char)c;
    return isdigit(uc) || (uc >= 'a' && uc <= 'f') || (uc >= 'A' && uc <= 'F');
}
|
||||
|
||||
// True for octal digits [0-7].
bool is_octal_character(char c) {
    return '0' <= c && c <= '7';
}
|
||||
|
||||
// True for binary digits '0' and '1'.
bool is_binary_character(char c) {
    switch (c) {
    case '0':
    case '1':
        return true;
    default:
        return false;
    }
}
|
||||
|
||||
// True for decimal digits. Cast to unsigned char before isdigit(): passing
// a negative plain char to <ctype.h> is undefined behavior (CERT STR37-C).
bool is_decimal_character(char c) {
    return isdigit((unsigned char)c);
}
|
||||
|
||||
// Lex a numeric literal: plain decimal, or 0x/0o/0b prefixed, followed by
// an optional :8/:16/:32/:64 width suffix. Malformed or overlong numbers
// become TOKEN_ERROR tokens (with an explanation); the error_t return is
// reserved for I/O failures.
error_t *lexer_next_number(lexer_t *lex, lexer_token_t *token) {
    constexpr size_t max_number_length = 128;
    size_t so_far = 0;
    size_t n = 0;
    char buffer[max_number_length + 1] = {};

    token->line_number = lex->line_number;
    token->character_number = lex->character_number;

    // Pick the digit predicate based on the radix prefix.
    char_predicate_t is_valid;
    if (lexer_has_prefix(lex, "0x")) {
        is_valid = is_hexadecimal_character;
        token->id = TOKEN_HEXADECIMAL;
        strcpy(buffer, "0x");
        so_far = 2;
    } else if (lexer_has_prefix(lex, "0o")) {
        is_valid = is_octal_character;
        token->id = TOKEN_OCTAL;
        strcpy(buffer, "0o");
        so_far = 2;
    } else if (lexer_has_prefix(lex, "0b")) {
        is_valid = is_binary_character;
        token->id = TOKEN_BINARY;
        strcpy(buffer, "0b");
        so_far = 2;
    } else {
        is_valid = is_decimal_character;
        token->id = TOKEN_DECIMAL;
    }
    if (so_far > 0) {
        lex->character_number += so_far;
        lexer_shift_buffer(lex, so_far);
    }

    // Digits after the prefix.
    error_t *err = lexer_consume(lex, max_number_length - so_far,
                                 buffer + so_far, is_valid, &n);
    if (err == err_consume_excessive_length) {
        token->id = TOKEN_ERROR;
        token->explanation =
            "Number length exceeds the maximum of 128 characters";
    }
    so_far += n;
    lex->character_number += n;
    if (n == 0) {
        // A bare prefix like "0x" with no digits is not a valid number.
        token->id = TOKEN_ERROR;
        token->explanation = "Invalid number format";
    }

    // Refill so the suffix lookahead below sees enough characters.
    err = lexer_fill_buffer(lex);
    if (err != err_eof && err) {
        return err;
    }

    // Optional width suffix.
    size_t suffix_length = 0;
    if (lexer_has_prefix(lex, ":8")) {
        suffix_length = 2;
    } else if (lexer_has_prefix(lex, ":16") || lexer_has_prefix(lex, ":32") ||
               lexer_has_prefix(lex, ":64")) {
        suffix_length = 3;
    }

    if (suffix_length > 0) {
        err = lexer_consume_n(lex, max_number_length - so_far, buffer + so_far,
                              suffix_length);
        if (err == err_consume_excessive_length) {
            token->id = TOKEN_ERROR;
            token->explanation =
                "Number length exceeds the maximum of 128 characters";
        } else if (err == nullptr) {
            // BUG FIX: the suffix characters were consumed but never added
            // to the column counter, desynchronizing subsequent token
            // positions on any line containing a width suffix.
            so_far += suffix_length;
            lex->character_number += suffix_length;
        }
    }

    token->value = strdup(buffer);
    return nullptr;
}
|
||||
// Lex a newline token: "\r\n" or "\n". A '\r' not followed by '\n' yields a
// TOKEN_ERROR. Two fixes versus the original: (1) the single-character
// token value is built with an explicit NUL terminator — strdup on a 1-byte
// compound literal read past the array; (2) the offending character is
// consumed, otherwise the caller would re-lex the same '\r' forever.
error_t *lexer_next_newline(lexer_t *lex, lexer_token_t *token) {
    token->line_number = lex->line_number;
    token->character_number = lex->character_number;
    token->id = TOKEN_NEWLINE;

    if (lexer_has_prefix(lex, "\r\n")) {
        lexer_shift_buffer(lex, 2);
        token->value = strdup("\r\n");
        lex->character_number = 0;
        lex->line_number += 1;
    } else if (lexer_has_prefix(lex, "\n")) {
        lexer_shift_buffer(lex, 1);
        token->value = strdup("\n");
        lex->character_number = 0;
        lex->line_number += 1;
    } else {
        token->id = TOKEN_ERROR;
        token->explanation = "Invalid newline format";
        char bad = lex->buffer[0];
        lexer_shift_buffer(lex, 1);
        lex->character_number += 1;
        token->value = strdup((char[]){bad, 0});
    }
    return nullptr;
}
|
||||
|
||||
// True for identifier characters [A-Za-z0-9_]. Cast to unsigned char
// before isalnum(): passing a negative plain char to <ctype.h> is
// undefined behavior (CERT STR37-C).
bool is_identifier_character(char c) {
    return isalnum((unsigned char)c) || c == '_';
}
|
||||
|
||||
// Lex an identifier: a run of [A-Za-z0-9_] (the dispatcher guarantees the
// first character is not a digit). Overlong identifiers become TOKEN_ERROR.
error_t *lexer_next_identifier(lexer_t *lex, lexer_token_t *token) {
    constexpr size_t max_identifier_length = 128;
    char scratch[max_identifier_length + 1] = {};
    size_t consumed = 0;

    token->id = TOKEN_IDENTIFIER;
    token->line_number = lex->line_number;
    token->character_number = lex->character_number;

    error_t *err = lexer_consume(lex, max_identifier_length, scratch,
                                 is_identifier_character, &consumed);
    if (err == err_consume_excessive_length) {
        token->id = TOKEN_ERROR;
        token->explanation =
            "Identifier length exceeds the maximum of 128 characters";
    }

    lex->character_number += consumed;
    token->value = strdup(scratch);
    return nullptr;
}
|
||||
|
||||
// TODO: character literals ('x', escapes) are not lexed yet; this stub
// always reports a "not implemented" error.
error_t *lexer_next_character(lexer_t *lex, lexer_token_t *token) {
    return lexer_not_implemented(lex, token);
}
|
||||
// TODO: string literals are not lexed yet; this stub always reports a
// "not implemented" error.
error_t *lexer_next_string(lexer_t *lex, lexer_token_t *token) {
    return lexer_not_implemented(lex, token);
}
|
||||
|
||||
// True for intra-line whitespace (space or tab); newlines are separate
// tokens and deliberately excluded.
bool is_whitespace_character(char c) {
    switch (c) {
    case ' ':
    case '\t':
        return true;
    default:
        return false;
    }
}
|
||||
|
||||
// Lex a run of spaces/tabs into a single TOKEN_WHITESPACE token.
error_t *lexer_next_whitespace(lexer_t *lex, lexer_token_t *token) {
    constexpr size_t max_whitespace_length = 1024;
    char scratch[max_whitespace_length + 1] = {};
    size_t consumed = 0;

    token->id = TOKEN_WHITESPACE;
    token->line_number = lex->line_number;
    token->character_number = lex->character_number;

    error_t *err = lexer_consume(lex, max_whitespace_length, scratch,
                                 is_whitespace_character, &consumed);
    if (err == err_consume_excessive_length) {
        token->id = TOKEN_ERROR;
        token->explanation =
            "Whitespace length exceeds the maximum of 1024 characters";
    }

    lex->character_number += consumed;
    token->value = strdup(scratch);
    return nullptr;
}
|
||||
|
||||
// Comments run to the end of the line: any character except CR/LF.
bool is_comment_character(char c) {
    return !(c == '\r' || c == '\n');
}
|
||||
|
||||
// Lex a ';' comment: everything up to (but not including) the newline.
error_t *lexer_next_comment(lexer_t *lex, lexer_token_t *token) {
    constexpr size_t max_comment_length = 1024;
    char scratch[max_comment_length + 1] = {};
    size_t consumed = 0;

    token->id = TOKEN_COMMENT;
    token->line_number = lex->line_number;
    token->character_number = lex->character_number;

    error_t *err = lexer_consume(lex, max_comment_length, scratch,
                                 is_comment_character, &consumed);
    if (err == err_consume_excessive_length) {
        token->id = TOKEN_ERROR;
        token->explanation =
            "Comment length exceeds the maximum of 1024 characters";
    }

    lex->character_number += consumed;
    token->value = strdup(scratch);
    return nullptr;
}
|
||||
|
||||
// Produce the next token. Multi-character tokens are delegated to their
// dedicated lexers based on the first buffered character; everything else
// is a single-character token (or a TOKEN_ERROR for unexpected input).
error_t *lexer_next(lexer_t *lex, lexer_token_t *token) {
    memset(token, 0, sizeof(lexer_token_t));
    error_t *err = lexer_fill_buffer(lex);
    if (err)
        return err;

    const char first = lex->buffer[0];
    if (first == '_' || isalpha(first))
        return lexer_next_identifier(lex, token);
    if (isdigit(first))
        return lexer_next_number(lex, token);
    if (first == '\'')
        return lexer_next_character(lex, token);
    if (first == '"')
        return lexer_next_string(lex, token);
    if (first == ' ' || first == '\t')
        return lexer_next_whitespace(lex, token);
    if (first == ';')
        return lexer_next_comment(lex, token);
    if (first == '\r' || first == '\n')
        return lexer_next_newline(lex, token);

    // Single-character tokens. Unmapped entries are zero, and TOKEN_ERROR
    // is deliberately the zero-valued enumerator.
    static const lexer_token_id_t single_char[256] = {
        [':'] = TOKEN_COLON,    [','] = TOKEN_COMMA,
        ['['] = TOKEN_LBRACKET, [']'] = TOKEN_RBRACKET,
        ['+'] = TOKEN_PLUS,     ['-'] = TOKEN_MINUS,
        ['*'] = TOKEN_ASTERISK, ['.'] = TOKEN_DOT,
    };
    token->id = single_char[(unsigned char)first];
    token->value = strdup((char[]){first, 0});
    lexer_shift_buffer(lex, 1);
    token->line_number = lex->line_number;
    token->character_number = lex->character_number;
    if (token->id == TOKEN_ERROR) {
        token->explanation =
            "unexpected character during lexing (first of token)";
    }
    lex->character_number += 1;
    return nullptr;
}
|
56
src/lexer.h
Normal file
56
src/lexer.h
Normal file
@ -0,0 +1,56 @@
|
||||
#ifndef INCLUDE_SRC_LEXER_H_
#define INCLUDE_SRC_LEXER_H_

#include "error.h"
#include <stddef.h>
#include <stdio.h>

// Sentinel compared by pointer identity: normal end of input, not a failure.
extern error_t *err_eof;

// Every kind of token the lexer can emit. TOKEN_ERROR is first (value 0)
// so zero-initialized tokens default to the error state.
typedef enum {
    TOKEN_ERROR,
    TOKEN_IDENTIFIER,
    TOKEN_DECIMAL,
    TOKEN_HEXADECIMAL,
    TOKEN_OCTAL,
    TOKEN_BINARY,
    TOKEN_CHAR,
    TOKEN_STRING,
    TOKEN_COLON,
    TOKEN_COMMA,
    TOKEN_LBRACKET,
    TOKEN_RBRACKET,
    TOKEN_PLUS,
    TOKEN_MINUS,
    TOKEN_ASTERISK,
    TOKEN_DOT,
    TOKEN_COMMENT,
    TOKEN_NEWLINE,
    TOKEN_WHITESPACE,
} lexer_token_id_t;

// One lexed token. `value` is heap-allocated and owned by the token
// (release via lexer_token_cleanup); `explanation` is a static string set
// only when id == TOKEN_ERROR. Positions are 0-based.
typedef struct lexer_token {
    lexer_token_id_t id;
    size_t line_number;
    size_t character_number;
    char *value;
    const char *explanation;
} lexer_token_t;

// Size of the internal read-ahead window; also bounds prefix lookahead.
constexpr size_t lexer_buffer_size = 32;

// Streaming lexer state over an open file.
typedef struct lexer {
    size_t line_number;      // 0-based current line
    size_t character_number; // 0-based current column
    size_t buffer_count;     // valid bytes at the front of buffer
    char buffer[lexer_buffer_size];
    FILE *fp;                // nullptr when the lexer is closed
} lexer_t;

void lexer_close(lexer_t *lex);
error_t *lexer_open(lexer_t *lex, char *path);
// Produces the next token; returns err_eof once input is exhausted.
error_t *lexer_next(lexer_t *lex, lexer_token_t *token);
void lexer_token_print(lexer_token_t *token);
void lexer_token_cleanup(lexer_token_t *token);

#endif // INCLUDE_SRC_LEXER_H_
|
62
src/main.c
Normal file
62
src/main.c
Normal file
@ -0,0 +1,62 @@
|
||||
#include "error.h"
|
||||
#include "lexer.h"
|
||||
|
||||
#include <limits.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
// -tokens mode sink: dump the token's metadata; never stops the loop.
bool print_token(lexer_token_t *token) {
    lexer_token_print(token);
    return true;
}
|
||||
|
||||
// -text mode sink: echo the token's raw text. On an error token, print a
// caret under the offending column plus the explanation, and return false
// to stop the lexing loop.
bool print_value(lexer_token_t *token) {
    if (token->id != TOKEN_ERROR) {
        printf("%s", token->value);
        return true;
    }
    printf("%s\n", token->value);
    for (size_t i = 0; i < token->character_number; ++i)
        printf(" ");
    printf("^-- %s\n", token->explanation);
    return false;
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
if (argc != 3 ||
|
||||
(strcmp(argv[1], "-tokens") != 0 && strcmp(argv[1], "-text") != 0)) {
|
||||
puts("Usage: oas -tokens <filename>");
|
||||
puts("Usage: oas -text <filename>");
|
||||
return 1;
|
||||
}
|
||||
|
||||
bool (*print_fn)(lexer_token_t *);
|
||||
char *filename = argv[2];
|
||||
if (strcmp(argv[1], "-tokens") == 0) {
|
||||
print_fn = print_token;
|
||||
} else {
|
||||
print_fn = print_value;
|
||||
}
|
||||
|
||||
lexer_t lex = {0};
|
||||
lexer_token_t token;
|
||||
error_t *err = lexer_open(&lex, filename);
|
||||
if (err) {
|
||||
puts(err->message);
|
||||
error_free(err);
|
||||
return 1;
|
||||
}
|
||||
|
||||
bool keep_going = true;
|
||||
while (keep_going && (err = lexer_next(&lex, &token)) == nullptr) {
|
||||
keep_going = print_fn(&token);
|
||||
free(token.value);
|
||||
}
|
||||
|
||||
if (err && err != err_eof) {
|
||||
puts(err->message);
|
||||
}
|
||||
error_free(err);
|
||||
return 0;
|
||||
}
|
9
tests/input/valid.asm
Normal file
9
tests/input/valid.asm
Normal file
@ -0,0 +1,9 @@
|
||||
_start:
|
||||
mov eax, 555 ; move 555 into eax
|
||||
push 0o777
|
||||
xor eax, 0xDEADBEEF
|
||||
and ecx, 0o770
|
||||
mov edx, 0b01010101
|
||||
push 0xffff:64
|
||||
push 0o777:16
|
||||
push 0b0001:16
|
18
validate.sh
Executable file
18
validate.sh
Executable file
@ -0,0 +1,18 @@
|
||||
#!/bin/bash

set -euo pipefail

# Static analysis first: --status-bugs makes scan-build exit non-zero (and
# abort this script) if any bug is reported.
scan-build -o reports/ -plist-html --status-bugs make clean all

# Build the sanitizer-instrumented binaries plus the plain one.
make clean sanitize all

# Run every test input through both sanitizer builds and valgrind, in both
# output modes. All expansions are quoted so filenames containing spaces
# survive word splitting (the original expanded ${ARGUMENTS[@]}, $ARGS and
# $INPUT_FILE unquoted).
ARGUMENTS=("-tokens" "-text")
while IFS= read -r INPUT_FILE; do
    for ARGS in "${ARGUMENTS[@]}"; do
        ./oas-asan "$ARGS" "$INPUT_FILE" > /dev/null
        ./oas-msan "$ARGS" "$INPUT_FILE" > /dev/null
        valgrind --leak-check=full --error-exitcode=1 ./oas "$ARGS" "$INPUT_FILE" >/dev/null
    done
done < <(find tests/input/ -type f -name '*.asm')
|
Loading…
x
Reference in New Issue
Block a user