Gaming. Wrote some tests and completed the lexer. Still need to test cases for non-instruction tokens - remedyvm - A toy RISC virtual machine inspired by Bell Labs' `dis' and Tsoding's `bm'

commit d895fb0ec2da821721e23ce508c0ee2165fc3ec1
parent ec3e1757ac50cd9d692354ee5f7cb54bd50b75c5
Author: Ethan Long <edl@disroot.org>
Date:   Sun, 18 Jun 2023 14:53:49 +1000

Gaming. Wrote some tests and completed the lexer
Still need to test cases for non-instruction tokens

Diffstat:
M implementations/C/src/remcc.c  | 381 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
A implementations/C/tests/lextest.c  | 105 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

2 files changed, 474 insertions(+), 12 deletions(-)
diff --git a/implementations/C/src/remcc.c b/implementations/C/src/remcc.c
@@ -1,4 +1,6 @@
 #include <assert.h>
+#include <ctype.h>
+#include <errno.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -45,29 +47,72 @@ typedef enum {
   NEG
 } conditional_t;
 
+typedef enum {
+  ARG,
+  TEMP,
+  RET
+} REG_TYPE;
+
+typedef struct {
+  REG_TYPE type;
+  uint64_t num;
+} reg_t;
+
+typedef enum {
+  REG,
+  IMM
+} OPER_TYPE;
+
+typedef struct {
+  OPER_TYPE type;
+  union {
+    reg_t reg;
+    uint64_t imm;
+  } val;
+} oper_t;
+
 typedef struct {
   opcode_t opcode;
   conditional_t cond;
-  uint64_t operand_1;
-  uint64_t operand_2;
-  uint64_t operand_3;
+  oper_t dest;
+  oper_t temp_1;
+  oper_t temp_2;
 } inst_t;
 
-typedef union {
-  opcode_t opcode;
-  conditional_t cond;
-  uint64_t operand;
+// The maximum number of characters representing a token
+#define MAX_TOK 256
+
+typedef enum {
+  OPCODE,
+  COND,
+  OPER,
+  ERR
+} TOKEN_TYPE;
+
+typedef enum {
+  LEX_ERROR
+} error_t;
+
+typedef struct {
+  TOKEN_TYPE type;
+  union {
+    opcode_t opcode;
+    conditional_t cond;
+    oper_t operand;
+    error_t error;
+  } val;
 } token_t;
 
 /* Function prototypes */
 int usage(char *arg0);
-token_t *tokenise(FILE *stream);
+token_t *lexer(FILE *stream);
+token_t lex(char *tok);
 inst_t *parse(token_t *tokens);
 uint8_t *byte_compile(inst_t *instructions);
 void write_bytecode(FILE *stream, uint8_t *bytecode);
 
 /* Implementation: */
-int main(int argc, char **argv) {
+/*int main(int argc, char **argv) {
   char *input_fname = NULL, *output_fname = NULL;
   FILE *input_f = NULL, *output_f = NULL;
   token_t *prog_tokens = NULL;
@@ -94,25 +139,44 @@ int main(int argc, char **argv) {
     return 1;
   }
 
-  prog_tokens = tokenise(input_f);
+  prog_tokens = lexer(input_f);
   prog_insts = parse(prog_tokens);
   prog_bytecode = byte_compile(prog_insts);
 
   write_bytecode(output_f, prog_bytecode);
 
   return 0;
-}
+  }*/
 
 int usage(char *arg0) {
   fprintf(stderr, "Usage: %s input.rasm output.rin\n", arg0);
   return 1;
 }
 
-token_t *tokenise(FILE *stream) {
+token_t *lexer(FILE *stream) {
   assert(NULL == "tokenise not yet implemented");
+
+  char buf[MAX_TOK] = {0};
+  size_t i, j = 0, tok_arr_size = 100;
+  token_t *tokens = calloc(tok_arr_size, sizeof(token_t));
+
+  while (!feof(stream)) {
+    for (i = 0; i < MAX_TOK && (buf[i] = getc(stream)) != ' '; i++);
+    buf[i] = '\0';
+
+    tokens[j] = lex(buf);
+
+    j++;
+    if (j >= tok_arr_size) {
+      tok_arr_size *= 2;
+      tokens = realloc(tokens, tok_arr_size);
+    }
+  }
+
   return NULL;
 }
 
+
 inst_t *parse(token_t *tokens) {
   assert(NULL == "parse not yet implemented");
   return NULL;
@@ -126,3 +190,296 @@ uint8_t *byte_compile(inst_t *instructions) {
 void write_bytecode(FILE *stream, uint8_t *bytecode) {
   assert(NULL == "write_bytecode not yet implemented");
 }
+
+// My homemade lexer, it's a bit filthy but it'll do for now
+token_t lex(char *tok_str) {
+  // TODO: Operands
+  // TODO: Case-Insensitive
+  token_t err = {
+    .type = ERR,
+    .val.error = LEX_ERROR,
+  };
+
+  switch (tok_str[0]) {
+  case 'a':
+    if (isdigit(tok_str[1])) {
+      unsigned long n = strtoul(tok_str + 1, NULL, 10);
+      if (errno) {
+	break;
+      }
+      return (token_t) {
+	.type = OPER,
+	.val.operand = {
+	  .type = REG,
+	  .val.reg = {
+	    .type = ARG,
+	    .num = n
+	  }
+	}
+      };
+    } else if (strcmp(tok_str + 1, "dd") == 0) {
+      return (token_t) {
+	.type = OPCODE,
+	.val.opcode = ADD
+      };
+    } else if (strcmp(tok_str + 1, "nd") == 0) {
+      return (token_t) {
+	.type = OPCODE,
+	.val.opcode = AND
+      };
+    }
+    break;
+
+  case 'c':
+    // The only C instruction is call
+    if (strcmp(tok_str + 1, "all") == 0) {
+      return (token_t) {
+	.type = OPCODE,
+	.val.opcode = CALL
+      };
+    }
+    break;
+
+  case 'd':
+    // The only D instruction is div
+    if (strcmp(tok_str + 1, "iv") == 0) {
+      return (token_t) {
+	.type = OPCODE,
+	.val.opcode = DIV
+      };
+    }
+    break;
+
+  case 'e':
+    if (strcmp(tok_str + 1, "q") == 0) {
+      return (token_t) {
+	.type = COND,
+	.val.cond = EQ
+      };
+    }
+    break;
+
+  case 'g':
+    if (strcmp(tok_str + 1, "t") == 0) {
+      return (token_t) {
+	.type = COND,
+	.val.cond = GT
+      };
+    } else if (strcmp(tok_str + 1, "eq") == 0) {
+      return (token_t) {
+	.type = COND,
+	.val.cond = GEQ
+      };
+    }
+    break;
+
+  case 'j':
+    // The only J instruction is jump
+    if (strcmp(tok_str + 1, "ump") == 0) {
+      return (token_t) {
+	.type = OPCODE,
+	.val.opcode = JUMP
+      };
+    }
+    break;
+    
+  case 'l':
+    // The only L instruction is load
+    if (strcmp(tok_str + 1, "t") == 0) {
+      return (token_t) {
+	.type = COND,
+	.val.cond = LT
+      };
+    } else if (strcmp(tok_str + 1, "eq") == 0) {
+      return (token_t) {
+	.type = COND,
+	.val.cond = LEQ
+      };
+    } else if (strcmp(tok_str + 1, "oad") == 0) {
+      return (token_t) {
+	.type = OPCODE,
+	.val.opcode = LOAD
+      };
+    }
+    break;
+
+  case 'm':
+    if (strcmp(tok_str + 1, "ul") == 0) {
+      return (token_t) {
+	.type = OPCODE,
+	.val.opcode = MUL
+      };
+    } else if (strcmp(tok_str + 1, "ove") == 0) {
+      return (token_t) {
+	.type = OPCODE,
+	.val.opcode = MOVE
+      };
+    }
+    break;
+    
+  case 'n':
+    if (strcmp(tok_str + 1, "op") == 0) {
+      return (token_t) {
+	.type = OPCODE,
+	.val.opcode = NOP
+      };
+    } else if (strcmp(tok_str + 1, "ot") == 0) {
+      return (token_t) {
+	.type = OPCODE,
+	.val.opcode = NOT
+      };
+    } else if (strcmp(tok_str + 1, "eq") == 0) {
+      return (token_t) {
+	.type = COND,
+	.val.cond = NEQ
+      };
+    } else if (strcmp(tok_str + 1, "eg") == 0) {
+      return (token_t) {
+	.type = COND,
+	.val.cond = NEG
+      };
+    }
+    break;
+    
+  case 'o':
+    if (strcmp(tok_str + 1, "r") == 0) {
+      return (token_t) {
+	.type = OPCODE,
+	.val.opcode = OR
+      };
+    }
+    break;
+    
+  case 'p':
+    switch (tok_str[1]) {
+    case 'u':
+      if (strcmp(tok_str + 2, "sh") == 0) {
+	return (token_t) {
+	  .type = OPCODE,
+	  .val.opcode = PUSH
+	};
+      }
+      break;
+      
+    case 'o':
+      if (strcmp(tok_str + 2, "p") == 0) {
+	return (token_t) {
+	  .type = OPCODE,
+	  .val.opcode = POP
+	};
+      } else if (strcmp(tok_str + 2, "s") == 0) {
+	return (token_t) {
+	  .type = COND,
+	  .val.cond = POS
+	};
+      }
+      break;
+
+    case 'e':
+      if (strcmp(tok_str + 2, "ek") == 0) {
+	return (token_t) {
+	  .type = OPCODE,
+	  .val.opcode = PEEK
+	};
+      }
+      break;
+
+    default:
+      break;
+    }
+    break;
+    
+  case 'r':
+    if (strcmp(tok_str + 1, "eturn") == 0) {
+      return (token_t) {
+	.type = OPCODE,
+	.val.opcode = RETURN
+      };
+    } else if (isdigit(tok_str[1])) {
+      unsigned long n = strtoul(tok_str + 1, NULL, 10);
+      if (errno) {
+	break;
+      }
+      return (token_t) {
+	.type = OPER,
+	.val.operand = {
+	  .type = REG,
+	  .val.reg = {
+	    .type = RET,
+	    .num = n
+	  }
+	}
+      };
+    }
+    break;
+    
+  case 's':
+    // FIXME: Filthy
+    if (strcmp(tok_str + 1, "ub") == 0) {
+      return (token_t) {
+	.type = OPCODE,
+	.val.opcode = SUB
+      };
+    } else if (strcmp(tok_str + 1, "wap") == 0) {
+      return (token_t) {
+	.type = OPCODE,
+	.val.opcode = SWAP
+      };
+    } else if (strcmp(tok_str + 1, "tore") == 0) {
+      return (token_t) {
+	.type = OPCODE,
+	.val.opcode = STORE
+      };
+    } else if (strcmp(tok_str + 1, "hiftl") == 0) {
+      return (token_t) {
+	.type = OPCODE,
+	.val.opcode = SHIFTL
+      };
+    } else if (strcmp(tok_str + 1, "hiftr(l)") == 0) {
+      return (token_t) {
+	.type = OPCODE,
+	.val.opcode = SHIFTR_L
+      };
+    } else if (strcmp(tok_str + 1, "hiftr(a)") == 0) {
+      return (token_t) {
+	.type = OPCODE,
+	.val.opcode = SHIFTR_A
+      };
+    }
+    break;
+
+  case 't':
+    if (isdigit(tok_str[1])) {
+      unsigned long n = strtoul(tok_str + 1, NULL, 10);
+      if (errno) {
+	break;
+      }
+      return (token_t) {
+	.type = OPER,
+	.val.operand = {
+	  .type = REG,
+	  .val.reg = {
+	    .type = TEMP,
+	    .num = n
+	  }
+	}
+      };
+    }
+    break;
+    
+  case 'x':
+    if (strcmp(tok_str + 1, "or") == 0) {
+      return (token_t) {
+	.type = OPCODE,
+	.val.opcode = XOR
+      };
+    }
+    break;
+
+  default:
+    break;
+  }
+
+  fprintf(stderr, "Unknown token: %s\n", tok_str);
+  return err;
+}
diff --git a/implementations/C/tests/lextest.c b/implementations/C/tests/lextest.c
@@ -0,0 +1,105 @@
+// Test if the assembler lexer is functioning as expected
+#import "../src/remcc.c"
+
+typedef enum {
+  PASS,
+  FAIL
+} RESULT;
+
+typedef struct {
+  RESULT state;
+  union {
+    char *result;
+    char *error;
+  } val;
+} result_t;
+
+result_t test_lex(char *tok, opcode_t expect);
+result_t test_lexer(FILE *stream);
+
+int main(int argc, char **argv) {
+  result_t result;
+
+  char *lex_tests[] = {
+    // Instructions tests
+    "nop",
+    "add",
+    "sub",
+    "mul",
+    "div",
+    "and",
+    "or",
+    "xor",
+    "not",
+    "shiftl",
+    "shiftr(l)",
+    "shiftr(a)",
+    "move",
+    "swap",
+    "push",
+    "pop",
+    "peek",
+    "load",
+    "store",
+    "jump",
+    "call",
+    "return",
+    NULL
+  };
+  opcode_t lex_expects[] = {
+    NOP,
+    ADD,
+    SUB,
+    MUL,
+    DIV,
+    AND,
+    OR,
+    XOR,
+    NOT,
+    SHIFTL,
+    SHIFTR_L,
+    SHIFTR_A,
+    MOVE,
+    SWAP,
+    PUSH,
+    POP,
+    PEEK,
+    LOAD,
+    STORE,
+    JUMP,
+    CALL,
+    RETURN
+  };
+
+  //FILE *stream_tests[] = {0};
+
+  for (int i = 0; lex_tests[i] != NULL; i++) {
+    switch ((result = test_lex(lex_tests[i], lex_expects[i])).state) {
+    case PASS:
+      printf("We have a success!\n");
+      printf("Result: %s\n", result.val.result);
+      break;
+    case FAIL:
+      fprintf(stderr, "Dumbledore dies\n");
+      fprintf(stderr, "Error: %s\n", result.val.error);
+      break;
+    }
+  }
+  //test_lexer();
+  return 0;
+}
+
+result_t test_lex(char *tok, opcode_t expect) {
+  token_t res = lex(tok);
+  if (res.type == OPCODE && res.val.opcode == expect) {
+    return (result_t) {
+      .state = PASS,
+      .val.result = "Success!"
+    };
+  } else {
+    return (result_t) {
+      .state = FAIL,
+      .val.error = "Didn't get what we expected!"
+    };
+  }
+}

	remedyvm A toy RISC virtual machine inspired by Bell Labs' `dis' and Tsoding's `bm'
	git clone git://git.ethandl.dev/remedyvm
	Log \| Files \| Refs

M	implementations/C/src/remcc.c	\|	381	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
A	implementations/C/tests/lextest.c	\|	105	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++