Compare commits

...

3 Commits

Author SHA1 Message Date
5cdb60d395 Remove peek function 2025-03-30 22:51:47 +02:00
e5830daac9 Add documentation comments to the lexer code 2025-03-30 22:51:15 +02:00
4becfb868e Reduce excessive line length in Makefile 2025-03-30 22:07:35 +02:00
3 changed files with 149 additions and 16 deletions

View File

@ -25,8 +25,12 @@ fuzz:
afl-fuzz -i tests/input -o reports/afl -m none -- ./oas-afl -tokens @@
sanitize:
make CFLAGS="$(CFLAGS) -fsanitize=address,undefined" LDFLAGS="-fsanitize=address,undefined" TARGET="oas-asan" clean-objects all
make CFLAGS="$(CFLAGS) -fsanitize=memory -fsanitize-memory-track-origins=2" LDFLAGS="-fsanitize=memory -fsanitize-memory-track-origins=2" TARGET="oas-msan" clean-objects all
make CFLAGS="$(CFLAGS) -fsanitize=address,undefined" \
LDFLAGS="-fsanitize=address,undefined" \
TARGET="oas-asan" clean-objects all
make CFLAGS="$(CFLAGS) -fsanitize=memory -fsanitize-memory-track-origins=2" \
LDFLAGS="-fsanitize=memory -fsanitize-memory-track-origins=2" \
TARGET="oas-msan" clean-objects all
make clean-objects
validate:

View File

@ -89,6 +89,15 @@ void lexer_close(lexer_t *lex) {
memset(lex, 0, sizeof(lexer_t));
}
/**
* Attempts to fill the lexer's internal buffer with more data from the file.
* Only reads data if the buffer isn't already full and the file hasn't reached
* EOF.
*
* @param lex The lexer to fill the buffer for
* @return nullptr on success, an error otherwise (including err_eof if EOF
* reached with empty buffer)
*/
error_t *lexer_fill_buffer(lexer_t *lex) {
if (feof(lex->fp) && lex->buffer_count == 0)
return err_eof;
@ -126,25 +135,28 @@ error_t *lexer_open(lexer_t *lex, char *path) {
return nullptr;
}
/**
* Shifts the lexer's buffer by n characters, discarding the first n characters
* and moving the remaining characters to the beginning of the buffer.
*
* @param lex The lexer whose buffer to shift
* @param n Number of characters to shift out
*
* @pre There must be at least n characters in the input buffer
*/
/**
 * Shifts the lexer's buffer by n characters, discarding the first n characters
 * and moving the remaining characters to the beginning of the buffer.
 *
 * @param lex The lexer whose buffer to shift
 * @param n Number of characters to shift out; must be non-negative
 *
 * @pre There must be at least n characters in the input buffer
 */
void lexer_shift_buffer(lexer_t *lex, int n) {
    /* Guard against negative n explicitly: if buffer_count is an unsigned
     * type, `buffer_count >= n` promotes a negative n to a huge unsigned
     * value, the assert passes, and the memmove below reads out of bounds. */
    assert(n >= 0);
    assert(lex->buffer_count >= n);
    lex->buffer_count -= n;
    /* memmove (not memcpy): source and destination regions overlap. */
    memmove(lex->buffer, lex->buffer + n, lex->buffer_count);
}
/**
 * Reads the next character from the input into *c.
 *
 * NOTE(review): despite the name, this CONSUMES the character — it shifts
 * it out of the buffer rather than leaving it in place.
 *
 * @param lex The lexer to read from
 * @param c Output parameter receiving the next character
 * @return nullptr on success, err_eof at end of input, or another error
 */
error_t *lexer_peek(lexer_t *lex, char *c) {
    error_t *err = lexer_fill_buffer(lex);
    if (err) {
        return err;
    }

    if (lex->buffer_count == 0) {
        return err_eof;
    }

    *c = lex->buffer[0];
    lexer_shift_buffer(lex, 1);
    return nullptr;
}
// This does _not_ fill the internal lexer buffer and you _must_ call
// lexer_fill_buffer() before calling this. It will always return false if your
// prefix is larger than lexer_buffer_size
/**
* Checks if the lexer's buffer starts with the given prefix.
*
* @param lex The lexer to check
* @param prefix The string prefix to check for
* @return true if the buffer starts with the prefix, false otherwise
*/
bool lexer_has_prefix(lexer_t *lex, char *prefix) {
size_t len = strlen(prefix);
if (len > lex->buffer_count)
@ -159,6 +171,17 @@ error_t *lexer_not_implemented(lexer_t *lex, lexer_token_t *token) {
lex->character_number);
}
/**
* Consumes exactly n characters from the buffer into the provided output
* buffer.
*
* @param lex The lexer to consume from
* @param len Size of the output buffer
* @param buffer Output buffer to store the consumed characters
* @param n Number of characters to consume
* @return nullptr on success, an error otherwise (err_buffer_underrun if buffer
* contains fewer than n characters)
*/
error_t *lexer_consume_n(lexer_t *lex, const size_t len,
char buffer[static len], const size_t n) {
if (lex->buffer_count < n)
@ -170,6 +193,20 @@ error_t *lexer_consume_n(lexer_t *lex, const size_t len,
lexer_shift_buffer(lex, n);
return nullptr;
}
/**
* Consumes characters from the lexer buffer that satisfy the predicate
* function. Will attempt to refill the buffer if more valid characters are
* available.
*
* @param lex The lexer to consume from
* @param n Maximum number of characters to consume
* @param buffer Output buffer to store consumed characters
* @param is_valid Function that determines if a character should be consumed
* @param n_consumed Output parameter that will contain the number of characters
* consumed
* @return nullptr on success, an error otherwise
*/
error_t *lexer_consume(lexer_t *lex, const size_t n, char buffer[static n],
char_predicate_t is_valid, size_t *n_consumed) {
const size_t buffer_size = n;
@ -217,6 +254,18 @@ bool is_decimal_character(char c) {
return isdigit(c);
}
/**
* Processes a number token (decimal, hexadecimal, octal, or binary).
* Handles number formats with optional size suffixes.
*
* @param lex The lexer to read from
* @param token Output parameter that will be populated with the token
* information
* @return nullptr on success, an error otherwise
*
* @pre There must be at least one character in the input buffer and it should
* be [0-9]
*/
error_t *lexer_next_number(lexer_t *lex, lexer_token_t *token) {
constexpr size_t max_number_length = 128;
size_t so_far = 0;
@ -294,6 +343,19 @@ error_t *lexer_next_number(lexer_t *lex, lexer_token_t *token) {
token->value = strdup(buffer);
return nullptr;
}
/**
* Processes a newline token (\n or \r\n).
* Updates the lexer's line and character position tracking.
*
* @param lex The lexer to read from
* @param token Output parameter that will be populated with the token
* information
* @return nullptr on success, an error otherwise
*
 * @pre There must be at least one character in the input buffer and it must
 * be [\r\n]
*/
error_t *lexer_next_newline(lexer_t *lex, lexer_token_t *token) {
token->line_number = lex->line_number;
token->character_number = lex->character_number;
@ -323,6 +385,19 @@ bool is_identifier_character(char c) {
return isalnum(c) || c == '_';
}
/**
* Processes an identifier token.
* Identifiers start with a letter or underscore and can contain alphanumeric
* characters or underscores.
*
* @param lex The lexer to read from
* @param token Output parameter that will be populated with the token
* information
* @return nullptr on success, an error otherwise
*
* @pre There must be at least 1 character in the read buffer and it must be
* [a-zA-Z_]
*/
error_t *lexer_next_identifier(lexer_t *lex, lexer_token_t *token) {
constexpr size_t max_identifier_length = 128;
size_t n = 0;
@ -355,6 +430,17 @@ bool is_whitespace_character(char c) {
return c == ' ' || c == '\t';
}
/**
* Processes a whitespace token (spaces and tabs).
*
* @param lex The lexer to read from
* @param token Output parameter that will be populated with the token
* information
* @return nullptr on success, an error otherwise
*
* @pre There must be at least one character in the buffer and it must be
* [ \t]
*/
error_t *lexer_next_whitespace(lexer_t *lex, lexer_token_t *token) {
constexpr size_t max_whitespace_length = 1024;
size_t n = 0;
@ -380,6 +466,16 @@ bool is_comment_character(char c) {
return c != '\r' && c != '\n';
}
/**
* Processes a comment token (starts with ';' and continues to end of line).
*
* @param lex The lexer to read from
* @param token Output parameter that will be populated with the token
* information
* @return nullptr on success, an error otherwise
*
* @pre There must be at least one character in the buffer and it must be ';'
*/
error_t *lexer_next_comment(lexer_t *lex, lexer_token_t *token) {
constexpr size_t max_comment_length = 1024;
size_t n = 0;

View File

@ -47,10 +47,43 @@ typedef struct lexer {
FILE *fp;
} lexer_t;
/**
* @brief Closes a lexer and releases associated resources
*
* @param lex Pointer to the lexer to close
*/
void lexer_close(lexer_t *lex);
/**
* @brief Opens a file for lexical analysis
*
* @param lex Pointer to the lexer to initialize
* @param path Path to the file to open
* @return error_t* nullptr on success, or error describing the failure
*/
error_t *lexer_open(lexer_t *lex, char *path);
/**
* @brief Reads the next token from the input stream
*
* @param lex Pointer to an initialized lexer
* @param token Pointer to a token structure to fill with the next token
* @return error_t* nullptr on success, err_eof at end of file, or other error
*/
error_t *lexer_next(lexer_t *lex, lexer_token_t *token);
/**
* @brief Prints a token to stdout for debugging purposes
*
* @param token Pointer to the token to print
*/
void lexer_token_print(lexer_token_t *token);
/**
* @brief Frees any resources associated with a token
*
* @param token Pointer to the token to clean up
*/
void lexer_token_cleanup(lexer_token_t *token);
#endif // INCLUDE_SRC_LEXER_H_