"""
BlackRoad OS Language Lexer
Transforms source code into tokens for parsing
"""

import re
from enum import Enum, auto
from dataclasses import dataclass
from typing import Any, List, Optional


class TokenType(Enum):
    """All possible token types in BlackRoad language"""

    # Literals
    INTEGER = auto()
    FLOAT = auto()
    STRING = auto()
    BOOLEAN = auto()
    COLOR = auto()

    # Identifiers & Keywords
    IDENTIFIER = auto()

    # Keywords - Control Flow
    IF = auto()
    ELIF = auto()
    ELSE = auto()
    MATCH = auto()
    FOR = auto()
    WHILE = auto()
    BREAK = auto()
    CONTINUE = auto()
    RETURN = auto()

    # Keywords - Declarations
    LET = auto()
    VAR = auto()
    CONST = auto()
    FUN = auto()
    ASYNC = auto()
    TYPE = auto()
    MODULE = auto()
    IMPORT = auto()
    FROM = auto()
    EXPORT = auto()

    # Keywords - 3D/Spatial
    SPACE = auto()
    CUBE = auto()
    SPHERE = auto()
    PLANE = auto()
    LIGHT = auto()
    CAMERA = auto()
    RENDER = auto()

    # Keywords - Concurrency
    SPAWN = auto()
    AWAIT = auto()
    CHAN = auto()
    SELECT = auto()
    CASE = auto()

    # Keywords - Memory
    MUT = auto()

    # Keywords - Metaprogramming
    MACRO = auto()
    COMPTIME = auto()
    ASM = auto()

    # Types
    INT = auto()
    FLOAT_TYPE = auto()
    STRING_TYPE = auto()
    BOOL_TYPE = auto()
    BYTE = auto()
    CHAR = auto()
    VEC2 = auto()
    VEC3 = auto()
    VEC4 = auto()
    COLOR_TYPE = auto()
    LIST = auto()
    DICT = auto()
    SET = auto()
    ANY = auto()

    # Operators
    PLUS = auto()           # +
    MINUS = auto()          # -
    STAR = auto()           # *
    SLASH = auto()          # /
    PERCENT = auto()        # %
    POWER = auto()          # **

    ASSIGN = auto()         # =
    PLUS_ASSIGN = auto()    # +=
    MINUS_ASSIGN = auto()   # -=
    STAR_ASSIGN = auto()    # *=
    SLASH_ASSIGN = auto()   # /=

    EQ = auto()             # ==
    NE = auto()             # !=
    LT = auto()             # <
    GT = auto()             # >
    LE = auto()             # <=
    GE = auto()             # >=

    AND = auto()            # and
    OR = auto()             # or
    NOT = auto()            # not

    AMPERSAND = auto()      # &
    PIPE = auto()           # |
    CARET = auto()          # ^
    TILDE = auto()          # ~

    # Delimiters
    LPAREN = auto()         # (
    RPAREN = auto()         # )
    LBRACKET = auto()       # [
    RBRACKET = auto()       # ]
    LBRACE = auto()         # {
    RBRACE = auto()         # }

    COLON = auto()          # :
    SEMICOLON = auto()      # ;
    COMMA = auto()          # ,
    DOT = auto()            # .
    ARROW = auto()          # ->
    DOUBLE_DOT = auto()     # ..
    TRIPLE_DOT = auto()     # ...
    AT = auto()             # @
    HASH = auto()           # #
    QUESTION = auto()       # ?
    DOLLAR = auto()         # $

    # Special
    NEWLINE = auto()
    INDENT = auto()
    DEDENT = auto()
    EOF = auto()

    # HTML-like tags for 3D
    LT_SLASH = auto()       # </
    SLASH_GT = auto()       # />


@dataclass
class Token:
    """Represents a single token"""

    type: TokenType
    # Payload: int/float/str/bool for literals, the lexeme for keywords and
    # operators, the indentation width for INDENT/DEDENT, None for EOF.
    value: Any
    line: int
    column: int

    def __repr__(self):
        return f"Token({self.type.name}, {self.value!r}, {self.line}:{self.column})"


class Lexer:
    """BlackRoad OS Language Lexer

    Converts a source string into a flat list of ``Token`` objects.  Block
    structure is reported through INDENT/DEDENT tokens derived from leading
    whitespace (a tab counts as 4 columns).
    """

    KEYWORDS = {
        # Control flow
        'if': TokenType.IF,
        'elif': TokenType.ELIF,
        'else': TokenType.ELSE,
        'match': TokenType.MATCH,
        'for': TokenType.FOR,
        'while': TokenType.WHILE,
        'break': TokenType.BREAK,
        'continue': TokenType.CONTINUE,
        'return': TokenType.RETURN,

        # Declarations
        'let': TokenType.LET,
        'var': TokenType.VAR,
        'const': TokenType.CONST,
        'fun': TokenType.FUN,
        'async': TokenType.ASYNC,
        'type': TokenType.TYPE,
        'module': TokenType.MODULE,
        'import': TokenType.IMPORT,
        'from': TokenType.FROM,
        'export': TokenType.EXPORT,

        # 3D/Spatial
        'space': TokenType.SPACE,
        'cube': TokenType.CUBE,
        'sphere': TokenType.SPHERE,
        'plane': TokenType.PLANE,
        'light': TokenType.LIGHT,
        'camera': TokenType.CAMERA,
        'render': TokenType.RENDER,

        # Concurrency
        'spawn': TokenType.SPAWN,
        'await': TokenType.AWAIT,
        'chan': TokenType.CHAN,
        'select': TokenType.SELECT,
        'case': TokenType.CASE,

        # Memory
        'mut': TokenType.MUT,

        # Metaprogramming
        'macro': TokenType.MACRO,
        'comptime': TokenType.COMPTIME,
        'asm': TokenType.ASM,

        # Boolean literals
        'true': TokenType.BOOLEAN,
        'false': TokenType.BOOLEAN,

        # Logical operators
        'and': TokenType.AND,
        'or': TokenType.OR,
        'not': TokenType.NOT,

        # Types
        'int': TokenType.INT,
        'float': TokenType.FLOAT_TYPE,
        'string': TokenType.STRING_TYPE,
        'bool': TokenType.BOOL_TYPE,
        'byte': TokenType.BYTE,
        'char': TokenType.CHAR,
        'vec2': TokenType.VEC2,
        'vec3': TokenType.VEC3,
        'vec4': TokenType.VEC4,
        'color': TokenType.COLOR_TYPE,
        'list': TokenType.LIST,
        'dict': TokenType.DICT,
        'set': TokenType.SET,
        'any': TokenType.ANY,
    }

    # Character classes.  Sets (not strings) so that a ``None`` lookahead at
    # end of input simply tests False instead of raising TypeError.
    _DIGITS = frozenset('0123456789')
    _HEX_DIGITS = frozenset('0123456789ABCDEFabcdef')

    # Escape sequences with a special meaning inside string literals; any
    # other escaped character stands for itself.
    _ESCAPES = {'n': '\n', 't': '\t', 'r': '\r', '\\': '\\'}

    # Two-character operators ('..' / '...' are handled separately because
    # they need a third character of lookahead).
    _TWO_CHAR_TOKENS = {
        '->': TokenType.ARROW,
        '==': TokenType.EQ,
        '!=': TokenType.NE,
        '<=': TokenType.LE,
        '>=': TokenType.GE,
        '</': TokenType.LT_SLASH,
        '/>': TokenType.SLASH_GT,
        '**': TokenType.POWER,
        '+=': TokenType.PLUS_ASSIGN,
        '-=': TokenType.MINUS_ASSIGN,
        '*=': TokenType.STAR_ASSIGN,
        '/=': TokenType.SLASH_ASSIGN,
    }

    # Single-character operators and delimiters.
    _SINGLE_CHAR_TOKENS = {
        '+': TokenType.PLUS,
        '-': TokenType.MINUS,
        '*': TokenType.STAR,
        '/': TokenType.SLASH,
        '%': TokenType.PERCENT,
        '=': TokenType.ASSIGN,
        '<': TokenType.LT,
        '>': TokenType.GT,
        '(': TokenType.LPAREN,
        ')': TokenType.RPAREN,
        '[': TokenType.LBRACKET,
        ']': TokenType.RBRACKET,
        '{': TokenType.LBRACE,
        '}': TokenType.RBRACE,
        ':': TokenType.COLON,
        ';': TokenType.SEMICOLON,
        ',': TokenType.COMMA,
        '.': TokenType.DOT,
        '@': TokenType.AT,
        '?': TokenType.QUESTION,
        '$': TokenType.DOLLAR,
        '&': TokenType.AMPERSAND,
        '|': TokenType.PIPE,
        '^': TokenType.CARET,
        '~': TokenType.TILDE,
    }

    def __init__(self, source: str, filename: str = ""):
        self.source = source
        self.filename = filename
        self.pos = 0
        self.line = 1
        self.column = 1
        self.tokens: List[Token] = []
        self.indent_stack = [0]  # Track indentation levels

    def current_char(self) -> Optional[str]:
        """Get current character without advancing (None at end of input)"""
        if self.pos >= len(self.source):
            return None
        return self.source[self.pos]

    def peek_char(self, offset: int = 1) -> Optional[str]:
        """Look ahead ``offset`` characters (None past end of input)"""
        pos = self.pos + offset
        if pos >= len(self.source):
            return None
        return self.source[pos]

    def advance(self) -> Optional[str]:
        """Consume the current character and return it.

        Updates line/column bookkeeping.  At end of input nothing is consumed
        and None is returned, so ``pos`` never runs past the source length.
        """
        char = self.current_char()
        if char is None:
            return None
        if char == '\n':
            self.line += 1
            self.column = 1
        else:
            self.column += 1
        self.pos += 1
        return char

    def skip_whitespace(self, skip_newlines: bool = False):
        """Skip whitespace (but preserve newlines for indentation unless skip_newlines=True)"""
        while True:
            char = self.current_char()
            # Explicit None check: trailing whitespace at end of input must
            # terminate the loop rather than feed None to a string `in` test.
            if char is None:
                return
            if char in ' \t\r' or (skip_newlines and char == '\n'):
                self.advance()
            else:
                return

    def skip_comment(self):
        """Skip a ``#`` single-line comment or a ``#[ ... ]#`` multi-line comment.

        Raises:
            SyntaxError: if a multi-line comment is not closed before end of input.
        """
        if self.current_char() != '#':
            return

        if self.peek_char() == '[':
            self.advance()  # '#'
            self.advance()  # '['
            while True:
                if self.current_char() is None:
                    raise SyntaxError(f"Unterminated multi-line comment at {self.line}:{self.column}")
                if self.current_char() == ']' and self.peek_char() == '#':
                    self.advance()  # ']'
                    self.advance()  # '#'
                    break
                self.advance()
        else:
            # Single-line comment: stop before the newline so the main loop
            # still sees it for indentation handling.
            while self.current_char() not in ['\n', None]:
                self.advance()

    def _consume_digits(self):
        """Advance past a (possibly empty) run of ASCII digits"""
        while self.current_char() in self._DIGITS:
            self.advance()

    def tokenize_number(self) -> Token:
        """Tokenize integer or float.

        Accepts ``digits``, ``digits.digits`` and an optional exponent
        (``e``/``E``, optional sign, digits).  A ``.`` that is immediately
        followed by another ``.`` is left alone so range expressions such as
        ``0..10`` lex as INTEGER DOUBLE_DOT INTEGER.  An ``e`` that is not
        followed by an exponent value is likewise left for the next token.
        """
        start_line = self.line
        start_column = self.column
        start_pos = self.pos
        is_float = False

        self._consume_digits()

        # Fractional part: at most one '.', and never the start of '..'.
        if self.current_char() == '.' and self.peek_char() != '.':
            is_float = True
            self.advance()
            self._consume_digits()

        # Exponent: only consumed when it is well-formed, so float() below
        # can never be handed something like "1e" or "1e+".
        if self.current_char() in ('e', 'E'):
            offset = 2 if self.peek_char() in ('+', '-') else 1
            if self.peek_char(offset) in self._DIGITS:
                is_float = True
                for _ in range(offset):
                    self.advance()
                self._consume_digits()

        num_str = self.source[start_pos:self.pos]
        if is_float:
            return Token(TokenType.FLOAT, float(num_str), start_line, start_column)
        return Token(TokenType.INTEGER, int(num_str), start_line, start_column)

    def tokenize_string(self) -> Token:
        """Tokenize string literal delimited by ``"`` or ``'``.

        Supports the escapes ``\\n``, ``\\t``, ``\\r``, ``\\\\`` and an escaped
        quote; any other escaped character is kept as-is.

        Raises:
            SyntaxError: if input ends before the closing quote (including
                directly after a backslash).
        """
        start_line = self.line
        start_column = self.column
        quote_char = self.advance()  # Skip opening quote
        parts = []

        while True:
            char = self.current_char()
            if char is None:
                raise SyntaxError(f"Unterminated string at {start_line}:{start_column}")
            if char == quote_char:
                break
            if char == '\\':
                self.advance()
                escape_char = self.current_char()
                if escape_char is None:
                    # Backslash was the last character of the input.
                    raise SyntaxError(f"Unterminated string at {start_line}:{start_column}")
                parts.append(self._ESCAPES.get(escape_char, escape_char))
            else:
                parts.append(char)
            self.advance()

        self.advance()  # Skip closing quote
        return Token(TokenType.STRING, ''.join(parts), start_line, start_column)

    def tokenize_color(self) -> Token:
        """Tokenize color literal like #FF1D6C.

        Raises:
            SyntaxError: if the literal is not of the form #RGB, #RRGGBB or
                #RRGGBBAA.
        """
        start_line = self.line
        start_column = self.column
        start_pos = self.pos

        self.advance()  # Skip '#'
        while self.current_char() in self._HEX_DIGITS:
            self.advance()
        color_code = self.source[start_pos:self.pos]

        if len(color_code) not in [4, 7, 9]:  # #RGB, #RRGGBB, #RRGGBBAA
            raise SyntaxError(f"Invalid color code {color_code} at {start_line}:{start_column}")

        return Token(TokenType.COLOR, color_code, start_line, start_column)

    def tokenize_identifier(self) -> Token:
        """Tokenize identifier or keyword.

        ``true``/``false`` produce a BOOLEAN token carrying a Python bool;
        every other keyword or identifier carries its own spelling.
        """
        start_line = self.line
        start_column = self.column
        start_pos = self.pos

        while self.current_char() and (self.current_char().isalnum() or self.current_char() == '_'):
            self.advance()
        identifier = self.source[start_pos:self.pos]

        token_type = self.KEYWORDS.get(identifier, TokenType.IDENTIFIER)
        if token_type == TokenType.BOOLEAN:
            return Token(token_type, identifier == 'true', start_line, start_column)
        return Token(token_type, identifier, start_line, start_column)

    def handle_indentation(self, indent_level: int):
        """Generate INDENT/DEDENT tokens based on indentation.

        Raises:
            SyntaxError: if a dedent does not land on an enclosing level.
        """
        if indent_level > self.indent_stack[-1]:
            self.indent_stack.append(indent_level)
            self.tokens.append(Token(TokenType.INDENT, indent_level, self.line, self.column))
        elif indent_level < self.indent_stack[-1]:
            while self.indent_stack and indent_level < self.indent_stack[-1]:
                self.indent_stack.pop()
                self.tokens.append(Token(TokenType.DEDENT, indent_level, self.line, self.column))

            if indent_level != self.indent_stack[-1]:
                raise SyntaxError(f"Inconsistent indentation at {self.line}:{self.column}")

    def _handle_newline(self):
        """Consume a newline, measure the next line's indent, emit tokens.

        Blank lines and lines that start with ``#`` produce no tokens here.
        Otherwise any INDENT/DEDENT tokens are emitted first, followed by a
        NEWLINE token.
        """
        self.advance()  # consume '\n'

        indent_level = 0
        while self.current_char() in (' ', '\t'):
            indent_level += 1 if self.current_char() == ' ' else 4  # Tab = 4 spaces
            self.advance()

        char = self.current_char()
        if char == '\n' or char == '#':
            return
        # A blank line in a CRLF file ends in '\r\n'; it must not influence
        # indentation either.  The '\r' is skipped as whitespace afterwards.
        if char == '\r' and self.peek_char() == '\n':
            return

        if char is not None:
            self.handle_indentation(indent_level)
        self.tokens.append(Token(TokenType.NEWLINE, '\n', self.line, self.column))

    def _tokenize_operator(self) -> Token:
        """Tokenize the operator or delimiter at the current position.

        Longest match wins: '...' over '..', two-character operators over
        single-character ones.

        Raises:
            SyntaxError: if the current character starts no known token.
        """
        char = self.current_char()
        line = self.line
        col = self.column
        following = self.peek_char()

        if char == '.' and following == '.':
            self.advance()
            self.advance()
            if self.current_char() == '.':
                self.advance()
                return Token(TokenType.TRIPLE_DOT, '...', line, col)
            return Token(TokenType.DOUBLE_DOT, '..', line, col)

        if following is not None:
            pair = char + following
            pair_type = self._TWO_CHAR_TOKENS.get(pair)
            if pair_type is not None:
                self.advance()
                self.advance()
                return Token(pair_type, pair, line, col)

        single_type = self._SINGLE_CHAR_TOKENS.get(char)
        if single_type is not None:
            self.advance()
            return Token(single_type, char, line, col)

        raise SyntaxError(f"Unexpected character '{char}' at {line}:{col}")

    def tokenize(self) -> List[Token]:
        """Main tokenization loop.

        Returns:
            The accumulated token list, always terminated by an EOF token and
            preceded by one DEDENT for every indentation level still open.

        Raises:
            SyntaxError: on malformed literals, inconsistent indentation or
                characters that start no token.
        """
        while self.pos < len(self.source):
            char = self.current_char()

            # Newlines drive indentation tracking
            if char == '\n':
                self._handle_newline()
                continue

            # Skip whitespace (but not newlines)
            if char in ' \t\r':
                self.skip_whitespace()
                continue

            # '#' starts a color literal when followed by a hex digit,
            # otherwise a comment
            if char == '#':
                if self.peek_char() in self._HEX_DIGITS:
                    self.tokens.append(self.tokenize_color())
                else:
                    self.skip_comment()
                continue

            # Numbers (ASCII digits only; int()/float() reject characters
            # such as superscripts that str.isdigit() would accept)
            if char in self._DIGITS:
                self.tokens.append(self.tokenize_number())
                continue

            # Strings
            if char in ('"', "'"):
                self.tokens.append(self.tokenize_string())
                continue

            # Identifiers and keywords
            if char.isalpha() or char == '_':
                self.tokens.append(self.tokenize_identifier())
                continue

            # Operators and delimiters
            self.tokens.append(self._tokenize_operator())

        # Handle remaining dedents
        while len(self.indent_stack) > 1:
            self.indent_stack.pop()
            self.tokens.append(Token(TokenType.DEDENT, 0, self.line, self.column))

        # Add EOF token
        self.tokens.append(Token(TokenType.EOF, None, self.line, self.column))

        return self.tokens


def lex(source: str, filename: str = "") -> List[Token]:
    """Convenience function to tokenize source code"""
    lexer = Lexer(source, filename)
    return lexer.tokenize()


# Example usage
if __name__ == "__main__":
    test_code = '''
# This is a test
let x: int = 42
let name = "BlackRoad"
let color = #FF1D6C

fun greet(name: string) -> string:
    return "Hello, {name}!"

space MyScene:
    cube Box1:
        position: vec3(0, 0, 0)
        color: #F5A623
'''

    tokens = lex(test_code)
    for token in tokens:
        print(token)