commit d895fb0ec2da821721e23ce508c0ee2165fc3ec1
parent ec3e1757ac50cd9d692354ee5f7cb54bd50b75c5
Author: Ethan Long <edl@disroot.org>
Date: Sun, 18 Jun 2023 14:53:49 +1000
Gaming. Wrote some tests and completed the lexer
Still need to write test cases for non-instruction tokens
Diffstat:
2 files changed, 474 insertions(+), 12 deletions(-)
diff --git a/implementations/C/src/remcc.c b/implementations/C/src/remcc.c
@@ -1,4 +1,6 @@
#include <assert.h>
+#include <ctype.h>
+#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
@@ -45,29 +47,72 @@ typedef enum {
NEG
} conditional_t;
+// The kinds of register an operand can name. lex() maps the register
+// prefixes 'a', 't' and 'r' to ARG, TEMP and RET respectively.
+typedef enum {
+ ARG,
+ TEMP,
+ RET
+} REG_TYPE;
+
+// A register reference: its kind plus the decimal number that followed the
+// register prefix in the source text (e.g. "t3" gives TEMP with num 3).
+typedef struct {
+ REG_TYPE type;
+ uint64_t num;
+} reg_t;
+
+// Discriminator for oper_t: a register operand or an immediate value.
+// NOTE(review): lex() only produces REG operands so far; IMM is not lexed yet.
+typedef enum {
+ REG,
+ IMM
+} OPER_TYPE;
+
+// An instruction operand, stored as a tagged union: lex() fills val.reg when
+// type is REG; val.imm is presumably the payload for IMM (not produced yet).
+typedef struct {
+ OPER_TYPE type;
+ union {
+ reg_t reg;
+ uint64_t imm;
+ } val;
+} oper_t;
+
+// One instruction: an opcode, a condition and three operands.
+// NOTE(review): presumably dest is the destination and temp_1/temp_2 are the
+// source operands; confirm once parse() is implemented.
typedef struct {
opcode_t opcode;
conditional_t cond;
- uint64_t operand_1;
- uint64_t operand_2;
- uint64_t operand_3;
+ oper_t dest;
+ oper_t temp_1;
+ oper_t temp_2;
} inst_t;
-typedef union {
- opcode_t opcode;
- conditional_t cond;
- uint64_t operand;
+// The maximum number of characters representing a token
+#define MAX_TOK 256
+
+// The kinds of token lex() can return: an instruction opcode, a condition
+// name, an operand, or an error marker for text that was not recognised.
+typedef enum {
+ OPCODE,
+ COND,
+ OPER,
+ ERR
+} TOKEN_TYPE;
+
+// Error codes carried by ERR tokens. LEX_ERROR is what lex() stores when it
+// cannot recognise a token.
+// NOTE(review): glibc's <errno.h> also declares a type named error_t when
+// _GNU_SOURCE is defined; confirm this name cannot clash in our builds.
+typedef enum {
+ LEX_ERROR
+} error_t;
+
+// A lexed token, stored as a tagged union. `type` says which member of `val`
+// is meaningful: opcode for OPCODE, cond for COND, operand for OPER and
+// error for ERR.
+typedef struct {
+ TOKEN_TYPE type;
+ union {
+ opcode_t opcode;
+ conditional_t cond;
+ oper_t operand;
+ error_t error;
+ } val;
} token_t;
/* Function prototypes */
int usage(char *arg0);
-token_t *tokenise(FILE *stream);
+token_t *lexer(FILE *stream);
+token_t lex(char *tok);
inst_t *parse(token_t *tokens);
uint8_t *byte_compile(inst_t *instructions);
void write_bytecode(FILE *stream, uint8_t *bytecode);
/* Implementation: */
-int main(int argc, char **argv) {
+/*int main(int argc, char **argv) {
char *input_fname = NULL, *output_fname = NULL;
FILE *input_f = NULL, *output_f = NULL;
token_t *prog_tokens = NULL;
@@ -94,25 +139,44 @@ int main(int argc, char **argv) {
return 1;
}
- prog_tokens = tokenise(input_f);
+ prog_tokens = lexer(input_f);
prog_insts = parse(prog_tokens);
prog_bytecode = byte_compile(prog_insts);
write_bytecode(output_f, prog_bytecode);
return 0;
-}
+ }*/
+// Print the expected command line to stderr, using arg0 as the program name.
+// Always returns 1, so a caller can return the result directly as a failing
+// exit status.
int usage(char *arg0) {
fprintf(stderr, "Usage: %s input.rasm output.rin\n", arg0);
return 1;
}
-token_t *tokenise(FILE *stream) {
+// Split `stream` into whitespace-separated words and lex each one with lex().
+//
+// Returns a heap-allocated array holding one token per word, which the caller
+// must free(), or NULL if memory runs out. Words longer than MAX_TOK - 1
+// characters are stored as ERR tokens instead of being lexed in truncated
+// form, so an over-long word can never be mistaken for a valid token.
+//
+// TODO: the number of tokens is not reported yet; parse() will need either a
+// count or a terminating token before it can walk the array.
+token_t *lexer(FILE *stream) {
-assert(NULL == "tokenise not yet implemented");
+    char buf[MAX_TOK] = {0};
+    size_t len = 0, count = 0, capacity = 100;
+    int too_long = 0;
+    token_t *tokens = calloc(capacity, sizeof *tokens);
+
+    if (tokens == NULL) {
+        return NULL;
+    }
+
+    for (;;) {
+        // getc() returns an int so that EOF stays distinct from every real
+        // byte; storing it straight into a char would lose that distinction
+        int c = getc(stream);
+
+        if (c != EOF && !isspace(c)) {
+            // Always leave room for the terminating NUL
+            if (len < MAX_TOK - 1) {
+                buf[len++] = (char) c;
+            } else {
+                too_long = 1;
+            }
+            continue;
+        }
+
+        // A separator or EOF ends the current word; empty words (runs of
+        // whitespace, trailing newline) produce no token
+        if (len > 0) {
+            if (count == capacity) {
+                token_t *grown;
+
+                // Refuse to grow if the byte size would overflow size_t
+                if (capacity > SIZE_MAX / 2 / sizeof *tokens) {
+                    free(tokens);
+                    return NULL;
+                }
+                // realloc() takes a size in bytes, and its result goes into a
+                // temporary so the old array is not leaked on failure
+                grown = realloc(tokens, capacity * 2 * sizeof *tokens);
+                if (grown == NULL) {
+                    free(tokens);
+                    return NULL;
+                }
+                tokens = grown;
+                capacity *= 2;
+            }
+
+            buf[len] = '\0';
+            if (too_long) {
+                fprintf(stderr, "Token too long (limit is %d characters)\n",
+                        MAX_TOK - 1);
+                tokens[count] = (token_t) {
+                    .type = ERR,
+                    .val.error = LEX_ERROR
+                };
+            } else {
+                tokens[count] = lex(buf);
+            }
+            count++;
+            len = 0;
+            too_long = 0;
+        }
+
+        if (c == EOF) {
+            break;
+        }
+    }
+
-return NULL;
+    return tokens;
}
+
inst_t *parse(token_t *tokens) {
assert(NULL == "parse not yet implemented");
return NULL;
@@ -126,3 +190,296 @@ uint8_t *byte_compile(inst_t *instructions) {
+// Placeholder for writing the compiled bytecode to `stream`; not implemented.
+// The assert compares NULL with the address of a string literal, which can
+// never be equal, so it aborts with the message unless NDEBUG is defined.
void write_bytecode(FILE *stream, uint8_t *bytecode) {
assert(NULL == "write_bytecode not yet implemented");
}
+
+// Lex one NUL-terminated token string into a token_t.
+//
+// Recognised tokens (lower-case only for now):
+//   - registers: 'a', 't' or 'r' followed by a decimal number (a0, t12, r1),
+//     returned as an OPER token holding an ARG, TEMP or RET register;
+//   - instruction mnemonics, returned as an OPCODE token;
+//   - condition names (eq, neq, gt, geq, lt, leq, pos, neg), returned as a
+//     COND token.
+//
+// Anything else is reported on stderr and returned as an ERR token carrying
+// LEX_ERROR. That includes a register whose number is out of range or is
+// followed by stray characters (e.g. "a1x").
+//
+// TODO: Immediate operands (IMM) are not lexed yet
+// TODO: Case-Insensitive
+token_t lex(char *tok_str) {
+    // Every fixed spelling next to the token it produces
+    static const struct {
+        const char *text;
+        token_t tok;
+    } keywords[] = {
+        { "nop",       { .type = OPCODE, .val.opcode = NOP } },
+        { "add",       { .type = OPCODE, .val.opcode = ADD } },
+        { "sub",       { .type = OPCODE, .val.opcode = SUB } },
+        { "mul",       { .type = OPCODE, .val.opcode = MUL } },
+        { "div",       { .type = OPCODE, .val.opcode = DIV } },
+        { "and",       { .type = OPCODE, .val.opcode = AND } },
+        { "or",        { .type = OPCODE, .val.opcode = OR } },
+        { "xor",       { .type = OPCODE, .val.opcode = XOR } },
+        { "not",       { .type = OPCODE, .val.opcode = NOT } },
+        { "shiftl",    { .type = OPCODE, .val.opcode = SHIFTL } },
+        { "shiftr(l)", { .type = OPCODE, .val.opcode = SHIFTR_L } },
+        { "shiftr(a)", { .type = OPCODE, .val.opcode = SHIFTR_A } },
+        { "move",      { .type = OPCODE, .val.opcode = MOVE } },
+        { "swap",      { .type = OPCODE, .val.opcode = SWAP } },
+        { "push",      { .type = OPCODE, .val.opcode = PUSH } },
+        { "pop",       { .type = OPCODE, .val.opcode = POP } },
+        { "peek",      { .type = OPCODE, .val.opcode = PEEK } },
+        { "load",      { .type = OPCODE, .val.opcode = LOAD } },
+        { "store",     { .type = OPCODE, .val.opcode = STORE } },
+        { "jump",      { .type = OPCODE, .val.opcode = JUMP } },
+        { "call",      { .type = OPCODE, .val.opcode = CALL } },
+        { "return",    { .type = OPCODE, .val.opcode = RETURN } },
+        { "eq",        { .type = COND, .val.cond = EQ } },
+        { "neq",       { .type = COND, .val.cond = NEQ } },
+        { "gt",        { .type = COND, .val.cond = GT } },
+        { "geq",       { .type = COND, .val.cond = GEQ } },
+        { "lt",        { .type = COND, .val.cond = LT } },
+        { "leq",       { .type = COND, .val.cond = LEQ } },
+        { "pos",       { .type = COND, .val.cond = POS } },
+        { "neg",       { .type = COND, .val.cond = NEG } },
+    };
+    const token_t err = {
+        .type = ERR,
+        .val.error = LEX_ERROR,
+    };
+    REG_TYPE reg_type = ARG;
+    int reg_prefix = 1;
+
+    switch (tok_str[0]) {
+    case 'a':
+        reg_type = ARG;
+        break;
+    case 't':
+        reg_type = TEMP;
+        break;
+    case 'r':
+        reg_type = RET;
+        break;
+    default:
+        reg_prefix = 0;
+        break;
+    }
+
+    // tok_str[1] is only read after tok_str[0] matched a letter, so the read
+    // stays inside the string. <ctype.h> functions need an unsigned char value.
+    if (reg_prefix && isdigit((unsigned char) tok_str[1])) {
+        char *end = NULL;
+
+        // strtoul() only ever sets errno, so clear it first; otherwise a
+        // stale error from an earlier call would reject a valid register
+        errno = 0;
+        unsigned long n = strtoul(tok_str + 1, &end, 10);
+
+        // The whole remainder must be digits and the value must be in range
+        if (errno == 0 && *end == '\0') {
+            return (token_t) {
+                .type = OPER,
+                .val.operand = {
+                    .type = REG,
+                    .val.reg = {
+                        .type = reg_type,
+                        .num = n
+                    }
+                }
+            };
+        }
+    } else {
+        // No keyword has a digit as its second character, so the keyword
+        // table only needs searching for non-register spellings
+        for (size_t i = 0; i < sizeof keywords / sizeof keywords[0]; i++) {
+            if (strcmp(tok_str, keywords[i].text) == 0) {
+                return keywords[i].tok;
+            }
+        }
+    }
+
+    fprintf(stderr, "Unknown token: %s\n", tok_str);
+    return err;
+}
diff --git a/implementations/C/tests/lextest.c b/implementations/C/tests/lextest.c
@@ -0,0 +1,105 @@
+// Test if the assembler lexer is functioning as expected
+#import "../src/remcc.c"
+
+// Outcome of a single test case.
+typedef enum {
+ PASS,
+ FAIL
+} RESULT;
+
+// A test outcome plus a message for it: test_lex() fills val.result when
+// state is PASS and val.error when state is FAIL. Both members are plain
+// strings.
+typedef struct {
+ RESULT state;
+ union {
+ char *result;
+ char *error;
+ } val;
+} result_t;
+
+result_t test_lex(char *tok, opcode_t expect);
+result_t test_lexer(FILE *stream);
+
+// Run every opcode mnemonic through test_lex() and report each outcome.
+//
+// Returns 0 when every case passes and 1 when at least one fails, so a build
+// script or CI job can detect a broken lexer from the exit status.
+int main(int argc, char **argv) {
+    // Each mnemonic sits next to the opcode it must lex to, so the inputs and
+    // the expectations cannot drift out of step with each other
+    static const struct {
+        char *text;
+        opcode_t expect;
+    } lex_cases[] = {
+        // Instructions tests
+        { "nop", NOP },
+        { "add", ADD },
+        { "sub", SUB },
+        { "mul", MUL },
+        { "div", DIV },
+        { "and", AND },
+        { "or", OR },
+        { "xor", XOR },
+        { "not", NOT },
+        { "shiftl", SHIFTL },
+        { "shiftr(l)", SHIFTR_L },
+        { "shiftr(a)", SHIFTR_A },
+        { "move", MOVE },
+        { "swap", SWAP },
+        { "push", PUSH },
+        { "pop", POP },
+        { "peek", PEEK },
+        { "load", LOAD },
+        { "store", STORE },
+        { "jump", JUMP },
+        { "call", CALL },
+        { "return", RETURN },
+    };
+    const size_t n_cases = sizeof lex_cases / sizeof lex_cases[0];
+    int failures = 0;
+
+    // The command line is not used yet
+    (void) argc;
+    (void) argv;
+
+    for (size_t i = 0; i < n_cases; i++) {
+        result_t result = test_lex(lex_cases[i].text, lex_cases[i].expect);
+
+        switch (result.state) {
+        case PASS:
+            printf("We have a success!\n");
+            printf("Result: %s\n", result.val.result);
+            break;
+        case FAIL:
+            fprintf(stderr, "Dumbledore dies\n");
+            fprintf(stderr, "Error: %s\n", result.val.error);
+            failures++;
+            break;
+        }
+    }
+
+    // TODO: stream-based tests for lexer() via test_lexer()
+    return failures == 0 ? 0 : 1;
+}
+
+// Lex `tok` and check that it comes back as an OPCODE token equal to `expect`.
+// Returns a PASS result carrying a success message, or a FAIL result carrying
+// an error message when the token has another type or another opcode.
+result_t test_lex(char *tok, opcode_t expect) {
+    const token_t lexed = lex(tok);
+    const int matches = (lexed.type == OPCODE) && (lexed.val.opcode == expect);
+
+    if (!matches) {
+        return (result_t) {
+            .state = FAIL,
+            .val.error = "Didn't get what we expected!"
+        };
+    }
+
+    return (result_t) {
+        .state = PASS,
+        .val.result = "Success!"
+    };
+}