// Lexer.cs
  1. using System;
  2. using System.Collections;
  3. using System.Collections.Generic;
  4. using System.Globalization;
  5. using System.Linq;
  6. using System.Text;
  7. using System.Text.RegularExpressions;
  8. namespace NTERA.Interpreter.Compiler
  9. {
  10. public class Lexer : IEnumerable<Token>
  11. {
  /// <summary>Complete text being tokenized; assigned once by the constructor.</summary>
  private readonly string source;

  /// <summary>Read position inside <c>source</c>; moved by <c>GetNextChar</c> and replaced by <c>GoTo</c>.</summary>
  private Marker sourceMarker;

  /// <summary>Character most recently produced by <c>GetNextChar</c>, peeked characters included.</summary>
  private char currentChar;

  /// <summary>The single token iterator of this lexer; created in the constructor and returned by <c>GetEnumerator</c>.</summary>
  private readonly IEnumerator<Token> currentEnumerator;

  /// <summary>Backing field of <see cref="Type"/>.</summary>
  private LexerType _type;

  /// <summary>
  /// Lexing mode of this instance. Every assignment re-runs <c>InitTokenDictionaries</c>,
  /// which selects the single-character token table matching the new mode.
  /// </summary>
  public LexerType Type
  {
      get { return _type; }
      internal set
      {
          _type = value;
          InitTokenDictionaries();
      }
  }

  /// <summary>Source position recorded by the token iterator just before it classifies each token.</summary>
  public Marker TokenMarker { get; set; }

  /// <summary>Text of the word most recently lexed as a keyword or identifier candidate.</summary>
  public string Identifer { get; set; }

  /// <summary>Payload of the most recent <c>Token.Value</c> (number or string).</summary>
  public Value Value { get; set; }
  /// <summary>
  /// Creates a lexer over <paramref name="input"/> and advances the token iterator once,
  /// so the first token is already available as the enumerator's <c>Current</c>.
  /// </summary>
  /// <param name="input">Source text to tokenize.</param>
  /// <param name="type">Lexing mode; assigning it also initialises the token lookup tables.</param>
  /// <exception cref="ArgumentNullException"><paramref name="input"/> is <c>null</c>.</exception>
  public Lexer(string input, LexerType type = LexerType.Both)
  {
      // Fail fast: without this check a null input only surfaces as a
      // NullReferenceException inside GetNextChar while the iterator is primed below.
      source = input ?? throw new ArgumentNullException(nameof(input));

      Type = type;

      // Pointer -1 places the read position just before the first character,
      // because GetNextChar advances the pointer before it reads.
      sourceMarker = new Marker(-1, 1, 0);

      currentEnumerator = GetTokens();
      currentEnumerator.MoveNext();
  }
  /// <summary>
  /// Replaces the read position with <paramref name="marker"/>.
  /// Only the position changes; the cached current character is left as it was.
  /// </summary>
  /// <param name="marker">Position the lexer should continue reading from.</param>
  public void GoTo(Marker marker) => sourceMarker = marker;
  /// <summary>
  /// Reads the character after the current read position and stores it in <c>currentChar</c>.
  /// </summary>
  /// <param name="peek">
  /// When <c>true</c> the read position is not advanced. <c>currentChar</c> is still
  /// overwritten with the character that was looked at.
  /// </param>
  /// <returns>
  /// The character read, or <c>'\0'</c> when the end of the source has been reached
  /// (in which case the pointer is parked at <c>source.Length</c>, even when peeking).
  /// </returns>
  private char GetNextChar(bool peek = false)
  {
      int nextIndex = sourceMarker.Pointer + 1;

      if (nextIndex >= source.Length)
      {
          sourceMarker.Pointer = source.Length;
          currentChar = '\0';
          return currentChar;
      }

      if (peek)
      {
          currentChar = source[nextIndex];
          return currentChar;
      }

      sourceMarker.Column++;
      sourceMarker.Pointer = nextIndex;
      currentChar = source[nextIndex];

      // A line feed starts a new line: the column restarts and the line counter advances.
      if (currentChar == '\n')
      {
          sourceMarker.Column = 0;
          sourceMarker.Line++;
      }

      return currentChar;
  }
  /// <summary>Keyword text to token, for keywords not flagged as line keywords. Shared by all lexers.</summary>
  private static Dictionary<string, Token> TokenDictionary;

  /// <summary>Keyword text to token, for keywords flagged as line keywords. Shared by all lexers.</summary>
  private static Dictionary<string, Token> TokenLineDictionary;

  /// <summary>Single-character token table used by this instance; chosen from the two shared tables below.</summary>
  private Dictionary<char, Token> TokenCharDictionary;

  /// <summary>Every single-character token, whatever its lexer context.</summary>
  private static Dictionary<char, Token> BothModeTokens;

  /// <summary>Single-character tokens whose lexer context includes <c>LexerType.String</c>.</summary>
  private static Dictionary<char, Token> StringModeTokens;

  /// <summary>
  /// Builds the shared keyword and character tables on first use (from the attributes
  /// declared on the <c>Token</c> enum members) and selects the character table that
  /// matches the current <see cref="Type"/>.
  /// </summary>
  /// <remarks>
  /// Each table is filled in a local variable and assigned to its static field only once
  /// complete, so another lexer being constructed at the same time cannot see a table
  /// that is non-null but still half populated.
  /// </remarks>
  private void InitTokenDictionaries()
  {
      if (TokenDictionary == null || TokenLineDictionary == null)
      {
          var keywords = new Dictionary<string, Token>(StringComparer.InvariantCultureIgnoreCase);
          var lineKeywords = new Dictionary<string, Token>(StringComparer.InvariantCultureIgnoreCase);

          foreach (Token token in Enum.GetValues(typeof(Token)))
          {
              foreach (var attribute in Utility.GetEnumAttributes<Token, LexerKeywordAttribute>(token))
              {
                  if (attribute.IsLineKeyword)
                      lineKeywords[attribute.Keyword] = token;
                  else
                      keywords[attribute.Keyword] = token;
              }
          }

          TokenLineDictionary = lineKeywords;
          TokenDictionary = keywords;
      }

      if (BothModeTokens == null || StringModeTokens == null)
      {
          var allCharTokens = new Dictionary<char, Token>();
          var stringCharTokens = new Dictionary<char, Token>();

          foreach (Token token in Enum.GetValues(typeof(Token)))
          {
              foreach (var attribute in Utility.GetEnumAttributes<Token, LexerCharacterAttribute>(token))
              {
                  // String-context characters go into both tables; all others only into the full one.
                  if ((attribute.LexerContext & LexerType.String) > 0)
                      stringCharTokens[attribute.Character] = token;

                  allCharTokens[attribute.Character] = token;
              }
          }

          StringModeTokens = stringCharTokens;
          BothModeTokens = allCharTokens;
      }

      TokenCharDictionary = Type == LexerType.String ? StringModeTokens : BothModeTokens;
  }
  /// <summary>
  /// Recognises a power-of-two literal of the form <c>&lt;digits&gt;p&lt;digits&gt;</c>.
  /// </summary>
  /// <remarks>
  /// The pattern is anchored to the whole input so that a word which merely contains such
  /// a sequence (for example <c>item1p4x</c>) is not mistaken for a number. It uses
  /// <c>[0-9]</c> rather than <c>\d</c> because <c>\d</c> also matches non-ASCII digits,
  /// which integer parsing rejects.
  /// </remarks>
  private static readonly Regex PowRegex = new Regex(@"\A([0-9]+)p([0-9]+)\z");
  /// <summary>
  /// Tells whether <paramref name="c"/> is white space that does not end a line:
  /// anything <see cref="char.IsWhiteSpace(char)"/> accepts, except the line feed.
  /// </summary>
  private static bool IsWhitespace(char c) => c != '\n' && char.IsWhiteSpace(c);
  /// <summary>
  /// Tells whether <paramref name="c"/> terminates a line: line feed, carriage return,
  /// or the <c>'\0'</c> end-of-source sentinel.
  /// </summary>
  private static bool IsEndOfLine(char c)
  {
      switch (c)
      {
          case '\n':
          case '\r':
          case '\0':
              return true;
          default:
              return false;
      }
  }
  /// <summary>Tag that opens a skipped region; only honoured when it is not past the first column.</summary>
  private const string SkipStartTag = "[SKIPSTART]";

  /// <summary>Tag whose line closes a skipped region.</summary>
  private const string SkipEndTag = "[SKIPEND]";

  /// <summary>
  /// Classifies one character as the start of a token, consuming any further characters
  /// that belong to it (comments, skipped regions, two-character operators, string literals).
  /// </summary>
  /// <param name="peek">
  /// Forwarded to <c>GetNextChar</c> when <paramref name="useCurrent"/> is <c>false</c>.
  /// The <c>+ - = &amp; |</c> branches additionally consume the operator character itself
  /// when this is <c>true</c>.
  /// </param>
  /// <param name="useCurrent">
  /// When <c>true</c> the already-read <c>currentChar</c> is classified; otherwise a
  /// character is fetched with <c>GetNextChar(peek)</c>.
  /// </param>
  /// <returns>
  /// The recognised token, or <c>Token.Unknown</c> when the character does not start one.
  /// A string literal yields <c>Token.Value</c> with <see cref="Value"/> set to its text.
  /// </returns>
  /// <exception cref="ParserException">A string literal is not closed before the end of the source.</exception>
  /// <remarks>
  /// NOTE(review): in peek mode the classified character has not been consumed, so the
  /// look-ahead in the '&lt;', '&gt;' and '@' branches re-reads that same character, and
  /// the '"' branch treats the opening quote as the closing one. That behaviour is
  /// unchanged here; confirm whether any caller depends on it before altering it.
  /// </remarks>
  private Token DetermineToken(bool peek, bool useCurrent)
  {
      char c = useCurrent ? currentChar : GetNextChar(peek);

      // Single-character tokens registered for the active lexer type take priority.
      if (TokenCharDictionary.TryGetValue(c, out Token charToken))
          return charToken;

      switch (c)
      {
          case ';': // a semicolon starts a comment that runs to the end of the line
              while (currentChar != '\n')
              {
                  if (currentChar == '\0')
                      return Token.EOF;

                  GetNextChar();
              }
              return Token.NewLine;

          case '[':
              // GetNextChar resets Column to 0 on '\n', so the first character of a
              // following line has Column 1; anything greater is mid-line.
              if (sourceMarker.Column > 1 || !SourceHasAt(sourceMarker.Pointer, SkipStartTag))
                  return Token.Unknown;

              return SkipDisabledBlock();

          case '%':
              return Type == LexerType.String ? Token.Format : Token.Modulo;

          case '<':
              if (!Type.HasFlag(LexerType.Real))
                  break;

              if (ConsumeIfNext('>'))
                  return Token.NotEqual;

              if (ConsumeIfNext('='))
                  return Token.LessEqual;

              return Token.Less;

          case '>':
              if (!Type.HasFlag(LexerType.Real))
                  break;

              return ConsumeIfNext('=') ? Token.MoreEqual : Token.More;

          case '+':
              if (Type == LexerType.String)
                  return Token.Unknown;

              if (peek)
                  GetNextChar();

              return ConsumeIfNext('+') ? Token.Increment : Token.Plus;

          case '-':
              if (Type == LexerType.String)
                  return Token.Unknown;

              if (peek)
                  GetNextChar();

              return ConsumeIfNext('-') ? Token.Decrement : Token.Minus;

          case '=':
              if (Type == LexerType.String)
                  return Token.Unknown;

              if (peek)
                  GetNextChar();

              // "=" and "==" are the same token; a second '=' is simply swallowed.
              ConsumeIfNext('=');
              return Token.Equal;

          case '&':
              if (peek)
                  GetNextChar();

              // "&" and "&&" are the same token.
              ConsumeIfNext('&');
              return Token.And;

          case '|':
              if (peek)
                  GetNextChar();

              // "|" and "||" are the same token.
              ConsumeIfNext('|');
              return Token.Or;

          case '@':
              if (Type == LexerType.String)
                  return Token.Unknown;

              // @"..." is lexed exactly like a plain string literal.
              if (ConsumeIfNext('"'))
                  return ReadStringLiteral();

              return Token.Function;

          case '"':
              return ReadStringLiteral();

          case '\0':
              return Token.EOF;
      }

      return Token.Unknown;
  }

  /// <summary>
  /// Consumes the next character when it equals <paramref name="expected"/>.
  /// The look-ahead itself overwrites <c>currentChar</c>, as every peek does.
  /// </summary>
  private bool ConsumeIfNext(char expected)
  {
      if (GetNextChar(true) != expected)
          return false;

      GetNextChar();
      return true;
  }

  /// <summary>
  /// Ordinal test for <paramref name="text"/> occurring in the source at <paramref name="index"/>.
  /// Returns <c>false</c> instead of throwing when fewer characters remain than the text is long.
  /// </summary>
  private bool SourceHasAt(int index, string text)
  {
      return index >= 0
          && index <= source.Length - text.Length
          && string.CompareOrdinal(source, index, text, 0, text.Length) == 0;
  }

  /// <summary>
  /// Consumes a skipped region up to the end of the line carrying the end tag.
  /// Returns <c>Token.NewLine</c> when that line ends with a line feed, and
  /// <c>Token.EOF</c> when the source ends first (with or without an end tag).
  /// </summary>
  private Token SkipDisabledBlock()
  {
      while (GetNextChar() != '\0')
      {
          if (currentChar != '[' || !SourceHasAt(sourceMarker.Pointer, SkipEndTag))
              continue;

          // End tag found: discard the rest of its line.
          while (true)
          {
              switch (GetNextChar())
              {
                  case '\n':
                      return Token.NewLine;
                  case '\0':
                      return Token.EOF;
              }
          }
      }

      return Token.EOF;
  }

  /// <summary>
  /// Reads characters up to the next unescaped double quote and stores the decoded text in
  /// <see cref="Value"/>. Recognised escapes are <c>\n</c>, <c>\t</c>, <c>\\</c> and
  /// <c>\"</c> (escape letter matched case-insensitively); any other escaped character is dropped.
  /// </summary>
  /// <exception cref="ParserException">The source ends before the closing quote.</exception>
  private Token ReadStringLiteral()
  {
      var text = new StringBuilder();

      while (GetNextChar() != '"')
      {
          if (currentChar == '\\')
          {
              switch (char.ToLowerInvariant(GetNextChar()))
              {
                  case 'n':
                      text.Append('\n');
                      break;
                  case 't':
                      text.Append('\t');
                      break;
                  case '\\':
                      text.Append('\\');
                      break;
                  case '"':
                      text.Append('"');
                      break;
              }
          }
          else if (currentChar == '\0')
          {
              throw new ParserException("Unexpected end of file");
          }
          else
          {
              text.Append(currentChar);
          }
      }

      Value = new Value(text.ToString());
      return Token.Value;
  }
  /// <summary>
  /// Iterator that turns the source into tokens. Before each token is yielded, the
  /// matching state is published: <see cref="TokenMarker"/> for its position,
  /// <see cref="Value"/> for literals and <see cref="Identifer"/> for words.
  /// </summary>
  /// <remarks>
  /// A word that is not a single-character token is tried, in order, as: a floating-point
  /// number, a <c>0x</c> hexadecimal integer, a <c>&lt;digits&gt;p&lt;digits&gt;</c> shift
  /// literal, a keyword, (in string mode) a raw string value, a line keyword followed by
  /// the rest of its line as one value, and finally an identifier.
  /// The sequence ends after <c>Token.EOF</c> has been yielded.
  /// </remarks>
  private IEnumerator<Token> GetTokens()
  {
      sourceMarker = new Marker(-1, 1, 0);

      while (true)
      {
          // Skip blanks (string mode keeps them) and always skip carriage returns.
          while ((IsWhitespace(GetNextChar()) && Type != LexerType.String) || currentChar == '\r')
          {
          }

          TokenMarker = sourceMarker;

          Token token = DetermineToken(false, true);

          if (token == Token.EOF)
          {
              yield return Token.EOF;
              yield break;
          }

          if (token != Token.Unknown)
          {
              yield return token;
              continue;
          }

          // Not a token start: gather characters until the next one would start a token
          // or (outside string mode) is white space.
          StringBuilder bodyBuilder = new StringBuilder(currentChar.ToString());

          while (DetermineToken(true, false) == Token.Unknown
                 && (!IsWhitespace(GetNextChar(true)) || Type == LexerType.String)
                 && GetNextChar(true) != '\r')
          {
              bodyBuilder.Append(GetNextChar());
          }

          string result = bodyBuilder.ToString();

          if (double.TryParse(result, NumberStyles.Float, CultureInfo.InvariantCulture, out var real))
          {
              Value = real;
              yield return Token.Value;
              continue;
          }

          // Only the leading "0x" is stripped (ordinal match), and the digits are parsed
          // culture-independently. Removing every "0x" would turn "0x10x2" into "102".
          if (result.StartsWith("0x", StringComparison.Ordinal)
              && int.TryParse(result.Substring(2), NumberStyles.HexNumber, CultureInfo.InvariantCulture, out int hexResult))
          {
              Value = hexResult;
              yield return Token.Value;
              continue;
          }

          // "<a>p<b>" means a << b. Operands that do not fit an int (or are not plain
          // ASCII digits) make the word fall through to keyword/identifier handling
          // instead of throwing out of the iterator.
          Match powMatch = PowRegex.Match(result);
          if (powMatch.Success
              && int.TryParse(powMatch.Groups[1].Value, NumberStyles.None, CultureInfo.InvariantCulture, out int a)
              && int.TryParse(powMatch.Groups[2].Value, NumberStyles.None, CultureInfo.InvariantCulture, out int b))
          {
              Value = a << b;
              yield return Token.Value;
              continue;
          }

          Identifer = result;

          if (TokenDictionary.TryGetValue(Identifer, out token))
          {
              yield return token;
              continue;
          }

          if (Type == LexerType.String)
          {
              // In string mode a non-keyword word is literal text, minus one leading blank.
              Value = char.IsWhiteSpace(Identifer[0])
                  ? Identifer.Substring(1)
                  : Identifer;
              yield return Token.Value;
              continue;
          }

          if (TokenLineDictionary.TryGetValue(Identifer, out token))
          {
              // A line keyword owns the rest of its line. The text is consumed first,
              // then the keyword is yielded, then the text as a single string value.
              bodyBuilder = new StringBuilder();
              while (!IsEndOfLine(GetNextChar(true)))
                  bodyBuilder.Append(GetNextChar());

              yield return token;

              string strValue = bodyBuilder.ToString();
              if (strValue.Length > 0 && char.IsWhiteSpace(strValue[0]))
                  strValue = strValue.Substring(1);

              Value = new Value(strValue);
              yield return Token.Value;

              // currentChar holds the peeked terminator that stopped the loop above.
              yield return currentChar == '\0' ? Token.EOF : Token.NewLine;
              continue;
          }

          yield return Token.Identifer;

          // currentChar is the peeked character that ended the word.
          if (currentChar == '\n')
              yield return Token.NewLine;
      }
  }
  /// <summary>
  /// Returns the lexer's single token iterator. Every call returns the same instance,
  /// which the constructor has already advanced to the first token.
  /// </summary>
  public IEnumerator<Token> GetEnumerator() => currentEnumerator;
  /// <summary>Non-generic counterpart; forwards to the generic <c>GetEnumerator</c>.</summary>
  IEnumerator IEnumerable.GetEnumerator() => GetEnumerator();
  /// <summary>
  /// Binding strength of the binary operators handled by <c>Expression</c>;
  /// a higher number binds tighter.
  /// </summary>
  private static readonly Dictionary<Token, int> OrderOfOps = new Dictionary<Token, int>
  {
      // logical
      { Token.Or, 0 },
      { Token.And, 0 },

      // comparison
      { Token.Equal, 1 },
      { Token.NotEqual, 1 },
      { Token.Less, 1 },
      { Token.More, 1 },
      { Token.LessEqual, 1 },
      { Token.MoreEqual, 1 },

      // additive
      { Token.Plus, 2 },
      { Token.Minus, 2 },

      // multiplicative; Modulo is emitted by the lexer for '%' and needs an entry so
      // that looking up its precedence cannot fail
      { Token.Asterisk, 3 },
      { Token.Slash, 3 },
      { Token.Modulo, 3 },

      { Token.Caret, 4 }
  };
  /// <summary>
  /// Evaluates the expression that starts at the enumerator's current token, using a
  /// value stack and an operator stack ordered by <c>OrderOfOps</c>. Evaluation stops at
  /// the first token that cannot be part of the expression, which is left as the current token.
  /// </summary>
  /// <returns>
  /// In string mode, the concatenation of every value left on the stack, in the order
  /// they were pushed; otherwise the value on top of the stack.
  /// </returns>
  /// <exception cref="ParserException">
  /// An identifier appears outside string mode, a parenthesis is not closed, the
  /// expression is empty outside string mode, an operator lacks an operand, or an
  /// operator has no precedence entry.
  /// </exception>
  public Value Expression()
  {
      Stack<Value> stack = new Stack<Value>();
      Stack<Token> operators = new Stack<Token>();

      // Applies a binary operator to the two topmost values.
      void Operation(Token token)
      {
          // Report malformed input such as "1 +" as a parse error at the token position
          // rather than letting Stack.Pop throw InvalidOperationException.
          if (stack.Count < 2)
              throw new ParserException($"Operator [{token}] is missing an operand", TokenMarker);

          Value b = stack.Pop();
          Value a = stack.Pop();
          stack.Push(a.Operate(b, token));
      }

      // Precedence lookup that fails with a parse error instead of KeyNotFoundException.
      int Precedence(Token token)
      {
          if (OrderOfOps.TryGetValue(token, out int rank))
              return rank;

          throw new ParserException($"Operator [{token}] has no defined precedence", TokenMarker);
      }

      int i = 0;
      while (true)
      {
          Token current = currentEnumerator.Current;

          if (current == Token.Value)
          {
              stack.Push(Value);
          }
          else if (current == Token.Identifer)
          {
              if (Type == LexerType.String)
                  stack.Push(Identifer);
              else
                  throw new ParserException("Undeclared variable " + Identifer, TokenMarker);
          }
          else if (current == Token.LParen)
          {
              currentEnumerator.MoveNext();
              stack.Push(Expression());

              // The nested call stops on the token it could not use; that must be the
              // closing parenthesis, which the MoveNext at the bottom of the loop consumes.
              if (currentEnumerator.Current != Token.RParen)
                  throw new ParserException($"Was expecting [RParen] got [{currentEnumerator.Current}]", TokenMarker);
          }
          else if (Type.HasFlag(LexerType.Real) && current.IsArithmetic()
                   && current.IsUnary() && i == 0)
          {
              // A leading unary operator is evaluated as "0 <op> operand".
              stack.Push(0);
              operators.Push(current);
          }
          else if ((Type == LexerType.String && current.IsStringOp())
                   || (Type.HasFlag(LexerType.Real) && current.IsArithmetic()))
          {
              // Apply pending operators that bind at least as tightly (left-associative).
              while (operators.Count > 0 && Precedence(current) <= Precedence(operators.Peek()))
                  Operation(operators.Pop());

              operators.Push(current);
          }
          else
          {
              if (i == 0)
              {
                  if (Type == LexerType.String)
                      stack.Push("");
                  else
                      throw new ParserException("Empty expression", TokenMarker);
              }

              break;
          }

          i++;
          currentEnumerator.MoveNext();
      }

      while (operators.Count > 0)
          Operation(operators.Pop());

      // A Stack enumerates newest-first, so each older value is prepended to rebuild push order.
      return Type == LexerType.String
          ? stack.Aggregate((a, b) => b.String + a.String)
          : stack.Pop();
  }
  426. }
  427. }