A scanner to break up a code input into LLVM Tokens though Lexical Analysis When you make the lexer, you want to define all:

  • Enum for Reserved Keywords (like def, if, func, etc…)
  • Enum for Identifiers (for variables, functions, classnames, etc)
  • Enum for Literals (for numbers, for chars, etc..)
  • Enum for Whitespace (like , \n, etc..)
  • A variable to store identifier token’s name
  • A variable to store a literal’s value
  • A function to grab the next token
// The lexer returns tokens [0-255] if it is an unknown character, otherwise one
// of these for known things.
enum Token {
  tok_eof = -1,
 
  // commands
  tok_def = -2,
  tok_extern = -3,
 
  // primary
  tok_identifier = -4,
  tok_number = -5,
};
 
static std::string IdentifierStr; // Filled in if tok_identifier
static double NumVal;             // Filled in if tok_number
 
// gettok - Return the next token from standard input.
static int gettok() {
  static int LastChar = ' ';
  
  // Skip any whitespace, until you reach the first non-whitespace token
  while (isspace(LastChar))
    LastChar = getchar();
 
  if (isalpha(LastChar)) { // identifier: [a-zA-Z][a-zA-Z0-9]* 
    // grabbing the entire token string
    IdentifierStr = LastChar;
    while (isalnum((LastChar = getchar())))
      IdentifierStr += LastChar;
 
    // recognizing token 
    if (IdentifierStr == "def")
      return tok_def;
    if (IdentifierStr == "extern")
      return tok_extern;
 
    return tok_identifier;
  }
  
  // if we are given a numerical character
  if (isdigit(LastChar) || LastChar == '.') {   // Number: [0-9.]+
    // create a string to hold the number with its decimal places
    std::string NumStr;
    do {
      NumStr += LastChar;
      LastChar = getchar();
    } while (isdigit(LastChar) || LastChar == '.');
  
    NumVal = strtod(NumStr.c_str(), 0);
    return tok_number;
  }
  // If we are given a comment
  if (LastChar == '#') {
  // Comment until end of line.
  do
    LastChar = getchar();
  while (LastChar != EOF && LastChar != '\n' && LastChar != '\r');
 
  if (LastChar != EOF)
    return gettok();
  }
 
  // Check for end of file.  Don't eat the EOF.
  if (LastChar == EOF)
    return tok_eof;
 
  // Otherwise, just return the character as its ascii value.
  int ThisChar = LastChar;
  LastChar = getchar();
  return ThisChar;
 
}
 

If any token is not defined, the lexer shall return as its ascii code instead.