Lexer.cs 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517
  1. using System;
  2. using System.Collections;
  3. using System.Collections.Generic;
  4. using System.Globalization;
  5. using System.Linq;
  6. using System.Text;
  7. using System.Text.RegularExpressions;
  8. namespace NTERA.Interpreter.Compiler
  9. {
  10. public class Lexer : IEnumerable<Token>
  11. {
  // ---- Immutable after construction ----

  // Full text being tokenized.
  private readonly string source;

  // The single token iterator produced by GetTokens(); it is shared by
  // GetEnumerator() and Expression().
  private readonly IEnumerator<Token> currentEnumerator;

  // ---- Mutable scanner state ----

  // Cursor (pointer / line / column) of the character most recently consumed.
  private Marker sourceMarker;

  // Character most recently returned by GetNextChar(), including peeks.
  private char currentChar;

  // Backing field for Type.
  private LexerType _type;

  /// <summary>
  /// Lexing mode. Assigning it re-runs <see cref="InitTokenDictionaries"/>,
  /// which selects the single-character token table matching the mode.
  /// </summary>
  public LexerType Type
  {
      get
      {
          return _type;
      }
      internal set
      {
          _type = value;
          InitTokenDictionaries();
      }
  }

  /// <summary>Cursor captured by the tokenizer just before it classifies each token.</summary>
  public Marker TokenMarker { get; set; }

  /// <summary>
  /// Raw text of the most recent word that was not recognised as a number.
  /// (The spelling is part of the public API and is kept as-is.)
  /// </summary>
  public string Identifer { get; set; }

  /// <summary>Payload of the most recent <c>Token.Value</c> token.</summary>
  public Value Value { get; set; }
  /// <summary>
  /// Creates a lexer over <paramref name="input"/> and advances to the first
  /// token so that the shared enumerator's <c>Current</c> is immediately valid.
  /// </summary>
  /// <param name="input">Source text to tokenize.</param>
  /// <param name="type">Lexing mode; defaults to <c>LexerType.Both</c>.</param>
  /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
  public Lexer(string input, LexerType type = LexerType.Both)
  {
      // Fail fast with a descriptive exception; previously a null input only
      // surfaced as a NullReferenceException from inside the first MoveNext().
      if (input == null)
          throw new ArgumentNullException(nameof(input));

      // The Type setter also selects the character-token table for this instance.
      Type = type;
      source = input;

      // Pointer -1 means "before the first character"; line 1, column 0.
      sourceMarker = new Marker(-1, 1, 0);

      currentEnumerator = GetTokens();
      currentEnumerator.MoveNext();
  }
  /// <summary>
  /// Repositions the scanner cursor to <paramref name="marker"/>.
  /// Only the cursor is replaced; the current character and the current
  /// token of the enumerator are left as they were.
  /// </summary>
  /// <param name="marker">Cursor to continue scanning from.</param>
  public void GoTo(Marker marker) => sourceMarker = marker;
  /// <summary>
  /// Reads the character after the cursor and stores it in <c>currentChar</c>.
  /// </summary>
  /// <param name="peek">
  /// When true the cursor is not advanced. Note that <c>currentChar</c> is
  /// still overwritten with the peeked character.
  /// </param>
  /// <returns>The character read, or <c>'\0'</c> once the end of the source is reached.</returns>
  private char GetNextChar(bool peek = false)
  {
      bool atEnd = sourceMarker.Pointer + 1 >= source.Length;
      if (atEnd)
      {
          // End of input: park the pointer at source.Length (this happens for
          // peeks as well) and report NUL.
          sourceMarker.Pointer = source.Length;
          currentChar = (char)0;
          return currentChar;
      }

      if (peek)
      {
          currentChar = source[sourceMarker.Pointer + 1];
          return currentChar;
      }

      sourceMarker.Column++;
      sourceMarker.Pointer++;
      currentChar = source[sourceMarker.Pointer];

      if (currentChar == '\n')
      {
          // A consumed newline starts a new line: reset the column, bump the line.
          sourceMarker.Column = 0;
          sourceMarker.Line++;
      }

      return currentChar;
  }
  // Keyword text -> token, built from LexerKeywordAttribute (case-insensitive keys).
  protected static Dictionary<string, Token> TokenDictionary;

  // Per-instance view: whichever of the two character tables matches Type.
  protected Dictionary<char, Token> TokenCharDictionary;

  // Every character declared through LexerCharacterAttribute.
  protected static Dictionary<char, Token> BothModeTokens;

  // Only the characters whose attribute context includes LexerType.String.
  protected static Dictionary<char, Token> StringModeTokens;

  // True once the static tables above have been built.
  private static bool _initialized = false;
  private static readonly object _initializedLock = new object();

  /// <summary>
  /// Builds the shared (static) token tables on first use and then selects the
  /// character table for this instance according to <see cref="Type"/>.
  /// </summary>
  /// <remarks>
  /// The one-time static construction and the per-instance selection are kept
  /// separate: the selection must run on every call (each instance, and each
  /// change of <see cref="Type"/>), while the tables are built only once.
  /// </remarks>
  private void InitTokenDictionaries()
  {
      lock (_initializedLock)
      {
          if (!_initialized)
          {
              if (TokenDictionary == null)
              {
                  // Populate a local first so the static field never exposes a
                  // half-filled dictionary.
                  var keywords = new Dictionary<string, Token>(StringComparer.InvariantCultureIgnoreCase);

                  foreach (Token token in Enum.GetValues(typeof(Token)))
                  {
                      foreach (var attribute in Utility.GetEnumAttributes<Token, LexerKeywordAttribute>(token))
                      {
                          keywords[attribute.Keyword] = token;
                      }
                  }

                  TokenDictionary = keywords;
              }

              if (BothModeTokens == null || StringModeTokens == null)
              {
                  var bothMode = new Dictionary<char, Token>();
                  var stringMode = new Dictionary<char, Token>();

                  foreach (Token token in Enum.GetValues(typeof(Token)))
                  {
                      foreach (var attribute in Utility.GetEnumAttributes<Token, LexerCharacterAttribute>(token))
                      {
                          // String-mode table is a subset; the "both" table gets everything.
                          if ((attribute.LexerContext & LexerType.String) > 0)
                              stringMode[attribute.Character] = token;

                          bothMode[attribute.Character] = token;
                      }
                  }

                  BothModeTokens = bothMode;
                  StringModeTokens = stringMode;
              }

              _initialized = true;
          }

          // Always executed: this is instance state and depends on the current Type.
          TokenCharDictionary = Type == LexerType.String ? StringModeTokens : BothModeTokens;
      }
  }
  // Matches the "<digits>p<digits>" literal form (the tokenizer evaluates it as a << b).
  // Anchored at both ends so that only a word consisting entirely of this form
  // is treated as a literal; without the anchors any text that merely contained
  // such a sequence was converted to a number and the rest of the text was lost.
  private static readonly Regex PowRegex = new Regex(@"^(\d+)p(\d+)$");
  /// <summary>
  /// True for any Unicode whitespace character except line feed, which the
  /// lexer handles separately from ordinary spacing.
  /// </summary>
  private static bool IsWhitespace(char c) => c != '\n' && char.IsWhiteSpace(c);
  /// <summary>
  /// True when <paramref name="c"/> is a line feed, a carriage return, or the
  /// NUL character that <c>GetNextChar</c> returns at end of input.
  /// </summary>
  private static bool IsEndOfLine(char c)
  {
      switch (c)
      {
          case '\n':
          case '\r':
          case '\0':
              return true;
          default:
              return false;
      }
  }
  /// <summary>
  /// Classifies a single character as a token, consuming any further
  /// characters that belong to the same token (two-character operators,
  /// comments, skip regions, string literals).
  /// </summary>
  /// <param name="peek">
  /// Passed to <c>GetNextChar</c> when <paramref name="useCurrent"/> is false,
  /// so the examined character is the one after the cursor. Several branches
  /// still advance the cursor in this mode (comment skipping, and the
  /// operators that call <c>GetNextChar()</c> when peeking).
  /// </param>
  /// <param name="useCurrent">When true, classifies <c>currentChar</c> instead of reading a new character.</param>
  /// <returns>
  /// The recognised token, <c>Token.EOF</c> at end of input, or
  /// <c>Token.Unknown</c> when the character does not start a token by itself.
  /// </returns>
  /// <exception cref="ParserException">A string literal is not closed before the end of the input.</exception>
  private Token DetermineToken(bool peek, bool useCurrent)
  {
      char c = useCurrent ? currentChar : GetNextChar(peek);

      // Single-character tokens for the active mode take priority.
      if (TokenCharDictionary.TryGetValue(c, out Token charToken))
          return charToken;

      switch (c)
      {
          case ';': // semicolon starts a comment that runs to the end of the line
              while (currentChar != '\n')
              {
                  if (currentChar == '\0')
                      return Token.EOF;

                  GetNextChar();
              }

              return Token.NewLine;

          case '[':
              const string SkipStart = "[SKIPSTART]";
              const string SkipEnd = "[SKIPEND]";

              // Only a marker at the start of a line opens a skip region.
              // The comparison is bounds-checked: a '[' closer to the end of the
              // source than the marker length used to make Substring throw.
              if (sourceMarker.Column > 1
                  || !SourceMatchesAt(sourceMarker.Pointer, SkipStart))
                  return Token.Unknown;

              while (GetNextChar() != '\0')
              {
                  if (currentChar == '[' && SourceMatchesAt(sourceMarker.Pointer, SkipEnd))
                  {
                      // Discard the remainder of the [SKIPEND] line.
                      while (true)
                      {
                          switch (GetNextChar())
                          {
                              case '\n':
                                  return Token.NewLine;
                              case '\0':
                                  return Token.EOF;
                          }
                      }
                  }
              }

              // Skip region was never closed.
              return Token.EOF;

          case '%':
              return Type == LexerType.String ? Token.Format : Token.Modulo;

          case '<':
              if (!Type.HasFlag(LexerType.Real))
                  break;

              if (GetNextChar(true) == '>')
              {
                  GetNextChar();
                  return Token.NotEqual;
              }
              else if (GetNextChar(true) == '=')
              {
                  GetNextChar();
                  return Token.LessEqual;
              }
              else
                  return Token.Less;

          case '>':
              if (!Type.HasFlag(LexerType.Real))
                  break;

              if (GetNextChar(true) == '=')
              {
                  GetNextChar();
                  return Token.MoreEqual;
              }
              else
                  return Token.More;

          case '+':
              if (Type == LexerType.String)
                  return Token.Unknown;

              // When peeking, step onto the operator so the look-ahead below
              // inspects the character that follows it.
              if (peek)
                  GetNextChar();

              if (GetNextChar(true) == '+')
              {
                  GetNextChar();
                  return Token.Increment;
              }
              else
                  return Token.Plus;

          case '-':
              if (Type == LexerType.String)
                  return Token.Unknown;

              if (peek)
                  GetNextChar();

              if (GetNextChar(true) == '-')
              {
                  GetNextChar();
                  return Token.Decrement;
              }
              else
                  return Token.Minus;

          case '=':
              if (Type == LexerType.String)
                  return Token.Unknown;

              if (peek)
                  GetNextChar();

              // "=" and "==" produce the same token.
              if (GetNextChar(true) == '=')
                  GetNextChar();

              return Token.Equal;

          case '&':
              if (peek)
                  GetNextChar();

              // "&" and "&&" produce the same token.
              if (GetNextChar(true) == '&')
                  GetNextChar();

              return Token.And;

          case '|':
              if (peek)
                  GetNextChar();

              // "|" and "||" produce the same token.
              if (GetNextChar(true) == '|')
                  GetNextChar();

              return Token.Or;

          case '@':
              if (Type == LexerType.String)
                  return Token.Unknown;

              // @"..." is read exactly like a plain string literal.
              if (GetNextChar(true) == '"')
              {
                  GetNextChar();
                  goto case '"';
              }

              return Token.AtSymbol;

          case '"':
              StringBuilder literal = new StringBuilder();

              while (GetNextChar() != '"')
              {
                  if (currentChar == '\\')
                  {
                      // Recognised escapes: \n \t \\ \" (letter case ignored).
                      // Any other escaped character is dropped.
                      switch (char.ToLowerInvariant(GetNextChar()))
                      {
                          case 'n':
                              literal.Append('\n');
                              break;
                          case 't':
                              literal.Append('\t');
                              break;
                          case '\\':
                              literal.Append('\\');
                              break;
                          case '"':
                              literal.Append('"');
                              break;
                      }
                  }
                  else if (currentChar == '\0')
                      throw new ParserException("Unexpected end of file");
                  else
                  {
                      literal.Append(currentChar);
                  }
              }

              Value = new Value(literal.ToString());
              return Token.Value;

          case (char)0:
              return Token.EOF;
      }

      return Token.Unknown;
  }

  /// <summary>
  /// Ordinal check that <paramref name="text"/> occurs in the source starting
  /// at <paramref name="index"/>; returns false instead of throwing when the
  /// source is too short or the index is out of range.
  /// </summary>
  private bool SourceMatchesAt(int index, string text)
  {
      return index >= 0
          && index + text.Length <= source.Length
          && string.CompareOrdinal(source, index, text, 0, text.Length) == 0;
  }
  /// <summary>
  /// Lazily tokenizes the source. The sequence ends with a single
  /// <c>Token.EOF</c>.
  /// </summary>
  /// <remarks>
  /// Before yielding, the matching property is filled in: <see cref="Value"/>
  /// for <c>Token.Value</c> and <see cref="Identifer"/> for words that are not
  /// numeric literals. <see cref="TokenMarker"/> is captured before each token
  /// is classified.
  /// </remarks>
  private IEnumerator<Token> GetTokens()
  {
      sourceMarker = new Marker(-1, 1, 0);

      while (true)
      {
          // Skip spacing between tokens. In String mode whitespace is
          // significant, so only carriage returns are skipped there.
          while ((IsWhitespace(GetNextChar()) && Type != LexerType.String) || currentChar == '\r')
          {
          }

          TokenMarker = sourceMarker;

          Token token = DetermineToken(false, true);

          if (token == Token.EOF)
          {
              yield return Token.EOF;
              yield break;
          }

          if (token != Token.Unknown)
          {
              yield return token;
              continue;
          }

          // Not a token by itself: accumulate a word until the next character
          // would start a token, is whitespace (outside String mode) or is '\r'.
          StringBuilder bodyBuilder = new StringBuilder(currentChar.ToString());

          while (DetermineToken(true, false) == Token.Unknown
                 && (!IsWhitespace(GetNextChar(true)) || Type == LexerType.String)
                 && GetNextChar(true) != '\r')
          {
              bodyBuilder.Append(GetNextChar());
          }

          string result = bodyBuilder.ToString();

          // Decimal / floating-point literal.
          if (double.TryParse(result, NumberStyles.Float, CultureInfo.InvariantCulture, out var real))
          {
              Value = real;
              yield return Token.Value;
              continue;
          }

          // Hexadecimal literal. Only the leading "0x" is stripped (Replace
          // removed every occurrence), and the prefix test and the parse no
          // longer depend on the current culture.
          if (result.StartsWith("0x", StringComparison.Ordinal)
              && int.TryParse(result.Substring(2), NumberStyles.HexNumber, CultureInfo.InvariantCulture, out int hexResult))
          {
              Value = hexResult;
              yield return Token.Value;
              continue;
          }

          // "<a>p<b>" literal, evaluated as a << b. TryParse keeps an
          // out-of-range or non-ASCII digit run from throwing; such text
          // falls through and is handled like any other word.
          Match powMatch = PowRegex.Match(result);
          if (powMatch.Success
              && int.TryParse(powMatch.Groups[1].Value, NumberStyles.None, CultureInfo.InvariantCulture, out int powBase)
              && int.TryParse(powMatch.Groups[2].Value, NumberStyles.None, CultureInfo.InvariantCulture, out int powShift))
          {
              Value = powBase << powShift;
              yield return Token.Value;
              continue;
          }

          Identifer = result;

          // Keyword lookup.
          if (TokenDictionary.TryGetValue(Identifer, out token))
          {
              yield return token;
              continue;
          }

          if (Type == LexerType.String)
          {
              // Plain text in String mode; one leading whitespace character is dropped.
              Value = char.IsWhiteSpace(Identifer[0])
                  ? Identifer.Substring(1)
                  : Identifer;
              yield return Token.Value;
              continue;
          }

          yield return Token.Identifer;

          // The look-ahead above can leave currentChar on a line feed (for
          // example after it skipped a trailing comment); emit the line break
          // that ended the identifier.
          // NOTE(review): the literal and keyword paths above do not do this - confirm that is intended.
          if (currentChar == '\n')
              yield return Token.NewLine;
      }
  }
  /// <summary>
  /// Returns the lexer's single shared token enumerator. Every call hands
  /// back the same instance, so enumeration continues from the current token
  /// rather than restarting.
  /// </summary>
  public IEnumerator<Token> GetEnumerator() => currentEnumerator;
  /// <summary>Non-generic enumeration; forwards to the generic <see cref="GetEnumerator"/>.</summary>
  IEnumerator IEnumerable.GetEnumerator() => GetEnumerator();
  // Binary operator precedence used by Expression(); a larger number binds tighter.
  private static readonly Dictionary<Token, int> OrderOfOps = new Dictionary<Token, int>
  {
      // logical
      [Token.Or] = 0,
      [Token.And] = 0,

      // comparison
      [Token.Equal] = 1,
      [Token.NotEqual] = 1,
      [Token.Less] = 1,
      [Token.More] = 1,
      [Token.LessEqual] = 1,
      [Token.MoreEqual] = 1,

      // additive
      [Token.Plus] = 2,
      [Token.Minus] = 2,

      // multiplicative
      [Token.Asterisk] = 3,
      [Token.Slash] = 3,

      // power
      [Token.Caret] = 4
  };
  /// <summary>
  /// Evaluates the expression that starts at the enumerator's current token,
  /// using a value stack and an operator stack ordered by <c>OrderOfOps</c>.
  /// Stops at the first token that cannot continue the expression and leaves
  /// the enumerator positioned on it.
  /// </summary>
  /// <returns>
  /// In String mode, the concatenation of all collected values in source
  /// order; otherwise the value on top of the stack.
  /// </returns>
  /// <exception cref="ParserException">
  /// An identifier is met outside String mode, a parenthesis is not closed,
  /// the expression is empty outside String mode, or an operator lacks an operand.
  /// </exception>
  public Value Expression()
  {
      Stack<Value> stack = new Stack<Value>();
      Stack<Token> operators = new Stack<Token>();

      // Applies a binary operator to the two topmost values.
      void Operation(Token token)
      {
          // Report malformed input (e.g. a trailing operator) as a parser error
          // rather than letting Stack.Pop throw InvalidOperationException.
          if (stack.Count < 2)
              throw new ParserException($"Operator [{token}] is missing an operand", TokenMarker);

          Value b = stack.Pop();
          Value a = stack.Pop();
          stack.Push(a.Operate(b, token));
      }

      // Number of tokens consumed so far by this (possibly nested) call.
      int i = 0;

      while (true)
      {
          Token current = currentEnumerator.Current;

          if (current == Token.Value)
          {
              stack.Push(Value);
          }
          else if (current == Token.Identifer)
          {
              if (Type == LexerType.String)
                  stack.Push(Identifer);
              else
                  throw new ParserException("Undeclared variable " + Identifer, TokenMarker);
          }
          else if (current == Token.LParen)
          {
              // Evaluate the parenthesised sub-expression recursively; it must
              // stop on the matching closing parenthesis.
              currentEnumerator.MoveNext();
              stack.Push(Expression());

              if (currentEnumerator.Current != Token.RParen)
                  throw new ParserException($"Was expecting [RParen] got [{currentEnumerator.Current}]", TokenMarker);
          }
          else if (Type.HasFlag(LexerType.Real) && current.IsArithmetic()
                   && current.IsUnary() && (i == 0))
          {
              // Leading unary operator: evaluate "op x" as "0 op x".
              stack.Push(0);
              operators.Push(current);
          }
          else if ((Type == LexerType.String && current.IsStringOp())
                   || (Type.HasFlag(LexerType.Real) && current.IsArithmetic()))
          {
              // Reduce every stacked operator that binds at least as tightly
              // before pushing the new one.
              while (operators.Count > 0 && OrderOfOps[current] <= OrderOfOps[operators.Peek()])
                  Operation(operators.Pop());

              operators.Push(current);
          }
          else
          {
              // Token does not belong to the expression.
              if (i == 0)
              {
                  if (Type == LexerType.String)
                      stack.Push("");
                  else
                      throw new ParserException("Empty expression", TokenMarker);
              }

              break;
          }

          i++;
          currentEnumerator.MoveNext();
      }

      while (operators.Count > 0)
          Operation(operators.Pop());

      // A Stack enumerates from the top down, so "b + a" restores source order.
      return Type == LexerType.String
          ? stack.Aggregate((a, b) => b.String + a.String)
          : stack.Pop();
  }
  416. }
  417. }