After doing some initial testing, the boot-strap capable parser seems to be able to best OILexer itself in parsing its own language. When I started this project back in 2008, the experience I had writing parsers was limited at best. The new mechanics put in place do things I can only hope to do by hand: unbound look-ahead. Granted the number of tokens seems to vary, but in cases where the look-ahead requires a loop, it seems to handle it just fine. Some initial testing, comparing the two file sets, shows the automated parser is quite fast when compared to its hand-written brethren:
Hand Built Parser:
Automated Parser
The paths for the hand-built parser are longer because the files are copied to the output directory for the project. I've blurred the full path to avoid giving too much about my personal system.
I'm rather shocked that there are cases where it is as much as a hundred and fifty times faster. These are -running in the debugger- figures (64-bit target), so the release times may vary. If anything I think the automated parser will just get faster.
There's a few things to note: the automated parser doesn't handle errors. It can detect them, but as far as saying where they are after the fact, it's a no-go. It's worth noting that OILexer's hand-written parser likely has a few areas where it will still hit an 'infinite loop', as I've encountered one or two in the live-edit mode within Visual Studio during its development (ok, more than one or two.) These have been patched along the way, but they're usually due to unexpected input and the fact that the hand-written parser does some wonky peeking at the lexical view to avoid needing to parse certain characters, which... sometimes don't even have valid tokens!
I suspect the automated build will slow down some once Error Handling is introduced. I'm getting ready to 'break stuff' in light of issues with Indirect Left Recursion that I think I have a logical plan of action for, but it's not a simple fix. More on that later (once I have figured it out :)
It's interesting to me to compare the automated version's 'prediction' versus the hand-written variant.
Automated Determination:private int _Predict14OnTokenItem(int laDepth) { int exitLADepth = -1; int currentLADepth = laDepth; int exitState = -1; this._state = 673; PredictMultiTargetState_673: switch (this._symbolStream.LookAhead(currentLADepth)) { case OilexerParserSymbols.BuiltInLanguageCommands_BaseEncode: case OilexerParserSymbols.BuiltInLanguageCommands_Scan: case OilexerParserSymbols.BuiltInLanguageCommands_Subtract: exitState = this._state = 681; exitLADepth = ++currentLADepth; goto DecisionPoint; case OilexerParserSymbols.CharacterRange: exitState = this._state = 680; exitLADepth = ++currentLADepth; goto DecisionPoint; case OilexerParserSymbols.LanguageOperatorToken_BeginGroup: exitState = this._state = 679; exitLADepth = ++currentLADepth; goto DecisionPoint; case OilexerParserSymbols.Char: case OilexerParserSymbols.String: exitState = this._state = 678; exitLADepth = ++currentLADepth; goto DecisionPoint; case OilexerParserSymbols.Identifier: exitState = this._state = 677; exitLADepth = ++currentLADepth; goto DecisionPoint; case OilexerParserSymbols.Ambiguity1: case OilexerParserSymbols.Ambiguity2: case OilexerParserSymbols.Ambiguity3: exitState = this._state = 674; exitLADepth = ++currentLADepth; switch (this._symbolStream.LookAhead(currentLADepth)) { case OilexerParserSymbols.LanguageOperatorToken_BeginGroup: exitState = this._state = 676; exitLADepth = ++currentLADepth; goto PredictMultiTargetState_676; case OilexerParserSymbols.LanguageOperatorToken_IdentitySeparator: case OilexerParserSymbols.LanguageOperatorToken_ItemIsRecognizer: case OilexerParserSymbols.LanguageOperatorToken_OneOrMore: case OilexerParserSymbols.LanguageOperatorToken_OptionsSeparator: case OilexerParserSymbols.LanguageOperatorToken_RepetitionRangeStart: case OilexerParserSymbols.LanguageOperatorToken_ZeroOrMore: case OilexerParserSymbols.LanguageOperatorToken_ZeroOrOne: exitState = this._state = 675; exitLADepth = ++currentLADepth; goto PredictMultiTargetState_675; default: goto DecisionPoint; } default: goto DecisionPoint; } PredictMultiTargetState_676: goto DecisionPoint; PredictMultiTargetState_675: goto DecisionPoint; DecisionPoint: switch (exitState) { case 674: case 675: case 677: return 548 /* SoftReferenceScannableEntryItem */; case 676: case 681: return 546 /* BuiltInCommandTokenItem */; case 678: return 549 /* LiteralTokenItem */; case 679: return 547 /* GroupTokenItem */; case 680: return 550 /* CharacterRangeTokenItem */; } return 0 /* The prediction did not match a valid alt. */; }Hand-written variant, doing prediction within the TokenExpression that uses the TokenItems
private void ParseTokenBody(List<ITokenExpression> expressions) { int l = this.CurrentTokenizer.GetLineIndex(this.StreamPosition); int ci = this.CurrentTokenizer.GetColumnIndex(this.StreamPosition); long p = this.StreamPosition; List<ITokenItem> seriesItemMembers = new List<ITokenItem>(); while (true) { ITokenItem item = null; IOilexerGrammarToken igdt = LookAhead(0); if (igdt == null && CurrentTokenizer.CurrentError != null) { currentTarget.SyntaxErrors.SyntaxError(CurrentTokenizer.CurrentError); break; } else { if (igdt == null) { LogError(OilexerGrammarParserErrors.Expected, "identifier, '(' or ';'"); return; } switch (igdt.TokenType) { case OilexerGrammarTokenType.Comment: var commentToken = (OilexerGrammarTokens.CommentToken)igdt; if (captureRegions && commentToken.MultiLine) { var gdResult = (OilexerGrammarFile)currentTarget.Result; gdResult.AddCommentRegion(commentToken); } PopAhead(); continue; case OilexerGrammarTokenType.CharacterLiteral: item = ParseTokenCharLiteral(); break; case OilexerGrammarTokenType.StringLiteral: item = ParseTokenStringLiteral(); break; case OilexerGrammarTokenType.Identifier: item = ParseTokenReferenceItem(); break; case OilexerGrammarTokenType.CharacterRange: item = ParseTokenCharacterRange(); break; case OilexerGrammarTokenType.NumberLiteral: LogError(OilexerGrammarParserErrors.Unexpected, "number", igdt.Position); goto yield; case OilexerGrammarTokenType.PreprocessorDirective: LogError(OilexerGrammarParserErrors.Unexpected, "preprocessor", igdt.Position); goto yield; case OilexerGrammarTokenType.Operator: switch (((OilexerGrammarTokens.OperatorToken)(igdt)).Type) { case OilexerGrammarTokens.OperatorType.Pipe: ClearAhead(); goto yield; case OilexerGrammarTokens.OperatorType.LeftParenthesis: ClearAhead(); item = (ITokenGroupItem)ParseTokenGroup(); break; case OilexerGrammarTokens.OperatorType.RightParenthesis: if (parameterDepth <= 0) LogError(OilexerGrammarParserErrors.Expected, ";", igdt.Position); goto yield; case OilexerGrammarTokens.OperatorType.GreaterThan: case OilexerGrammarTokens.OperatorType.Comma: if (commandDepths.Contains(parameterDepth)) { ClearAhead(); goto yield; } PopAhead(); LogError(OilexerGrammarParserErrors.Expected, "identifier, '(', or ';'"); PopAhead(); goto yield; case OilexerGrammarTokens.OperatorType.SemiColon: ClearAhead(); if (templateDepth != 0 || parameterDepth != 0) { Expect((templateDepth == 0 ? ">" : string.Empty) + (parameterDepth > 0 ? ")" : string.Empty), this.StreamPosition); } goto yield; default: goto default_2; } break; default: default_2: LogError(OilexerGrammarParserErrors.Expected, "identifier, '(' or ';'"); PopAhead(); break; } } if (item != null) { ClearAhead(); if (TokenizerLookAhead(0) == ':') { ParseItemOptions(item); } ParseItemRepeatOptions(item); seriesItemMembers.Add(item); } } yield: expressions.Add(new TokenExpression(seriesItemMembers, CurrentTokenizer.FileName, l, ci, p)); }
No comments:
Post a Comment