/// Dedicated to the public domain by Christopher Diggins
/// http://creativecommons.org/licenses/publicdomain/
using System;
using System.Collections.Generic;
using System.Text;
using System.Diagnostics;
namespace Peg
{
///
/// Takes a string as input and constructs an abstract syntax tree.
///
public class Parser
{
int mIndex;
string mData;
PegAstNode mTree;
PegAstNode mCur;
public Parser(string s)
{
mIndex = 0;
mData = s;
mTree = new PegAstNode(0, mData, null, null);
mCur = mTree;
}
public bool AtEnd()
{
return mIndex >= mData.Length;
}
public int GetPos()
{
return mIndex;
}
public string CurrentLine
{
get
{
return mData.Substring(mIndex, 20);
}
}
public string ParserPosition
{
get
{
string ret = "";
int nLine = 0;
int nLastLineChar = 0;
for (int i = 0; i < mIndex; ++i)
{
if (mData[i].Equals('\n'))
{
nLine++;
nLastLineChar = i;
}
}
int nCol = mIndex - nLastLineChar;
ret += "CatProgram " + nLine.ToString() + ", Column " + nCol + "\n";
int nNextLine = mIndex;
while (nNextLine < mData.Length && !mData[nNextLine].Equals('\n'))
nNextLine++;
ret += mData.Substring(nLastLineChar, nNextLine - nLastLineChar) + "\n";
ret += new String(' ', nCol);
ret += "^";
return ret;
}
}
public void SetPos(int pos)
{
mIndex = pos;
}
public void GotoNext()
{
if (AtEnd())
{
throw new Exception("passed the end of input");
}
mIndex++;
}
public char GetChar()
{
if (AtEnd())
{
throw new Exception("passed end of input");
}
return mData[mIndex];
}
public PegAstNode CreateNode(Object label)
{
Trace.Assert(mCur != null);
mCur = mCur.Add(this, label);
Trace.Assert(mCur != null);
return mCur;
}
public void AbandonNode()
{
Trace.Assert(mCur != null);
PegAstNode tmp = mCur;
mCur = mCur.GetParent();
Trace.Assert(mCur != null);
mCur.Remove(tmp);
}
public void CompleteNode()
{
Trace.Assert(mCur != null);
mCur.Complete(this);
mCur = mCur.GetParent();
Trace.Assert(mCur != null);
}
public PegAstNode GetAst()
{
return mTree;
}
public bool Parse(Grammar.Rule rule)
{
bool b = false;
b = rule.Match(this);
if (b)
{
if (mCur != mTree)
throw new Exception("internal error: parse tree and parse node do not match after parsing");
mCur.Complete(this);
}
return b;
}
}
///
/// A node in an abstract syntax tree.
///
public class PegAstNode
{
int mnBegin;
int mnCount;
Object mLabel;
String msText;
PegAstNode mpParent;
List mChildren = new List();
public PegAstNode(int n, String text, PegAstNode p, Object label)
{
msText = text;
mnBegin = n;
mnCount = -1;
mpParent = p;
mLabel = label;
}
public PegAstNode Add(Parser p, Object label)
{
PegAstNode ret = new PegAstNode(p.GetPos(), msText, this, label);
mChildren.Add(ret);
return ret;
}
public void Complete(Parser p)
{
mnCount = p.GetPos() - mnBegin;
}
public PegAstNode GetParent()
{
return mpParent;
}
public void Remove(PegAstNode x)
{
mChildren.Remove(x);
}
public override string ToString()
{
return msText.Substring(mnBegin, mnCount);
}
public List GetChildren()
{
return mChildren;
}
public int GetNumChildren()
{
return mChildren.Count;
}
public PegAstNode GetChild(int n)
{
return mChildren[n];
}
public Object GetLabel()
{
return mLabel;
}
}
///
/// A grammar is a set of rules which define a language. Grammar rules in the context
/// of a recursive descent parsing library correspond to pattern matches which are also known
/// somewhat confusingly as "parsers".
///
public class Grammar
{
///
/// Used with RuleDelay to make cicrcular rule references
///
///
public delegate Rule RuleDelegate();
///
/// A rule class corresponds a PEG grammar production rule.
/// A production rule describes how to generate valid syntactic
/// phrases in a programming language. A production rule also
/// corresponds to a pattern matcher in a recursive-descent parser.
///
/// Each instance of a Rule class has a Match function which
/// has the responsibility to look at the current input
/// (which is managed by a Parser object) and return true or false,
/// depending on whether the current input corresponds
/// to the rule. The Match function will increment the parsers internal
/// pointer as it successfully matches characters, but will also
/// restore the pointer if it fails.
///
/// Some rules have extra responsibilities above and beyond matchin (
/// such as throwing exceptions, or creating an AST) which are described below.
///
public abstract class Rule
{
public abstract bool Match(Parser p);
public override string ToString()
{
return "";
}
}
///
/// This associates a rule with a node in the abstract syntax tree (AST).
/// Even though one could automatically associate each production rule with
/// an AST node it is very cumbersome and inefficient to create and parse.
/// In otherwords the grammar tree is not expected to correspond directly to
/// the syntax tree since much of the grammar is noise (e.g. whitespace).
///
public class AstNodeRule : Rule
{
Rule mRule;
Object mLabel;
public AstNodeRule(Object label, Rule r)
{
Trace.Assert(r != null);
mLabel = label;
mRule = r;
}
public override bool Match(Parser p)
{
p.CreateNode(mLabel);
bool result = mRule.Match(p);
if (result)
{
p.CompleteNode();
}
else
{
p.AbandonNode();
}
return result;
}
public override string ToString()
{
return mLabel.ToString();
}
}
///
/// This rule is neccessary allows you to make recursive references in the grammar.
/// If you don't use this rule in a cyclical rule reference (e.g. A ::= B C, B ::== A D)
/// then you will end up with an infinite loop during grammar generation.
///
public class DelayRule : Rule
{
RuleDelegate mDeleg;
public DelayRule(RuleDelegate deleg)
{
Trace.Assert(deleg != null);
mDeleg = deleg;
}
public override bool Match(Parser p)
{
return mDeleg().Match(p);
}
public override string ToString()
{
// WARNING: this can generate an infinite loops if you have cyclical
// rule references. To break any loops you must either return an empty
// string or make sure that in the cyclical reference is a call to
// AstNodeRule. AstNodeRule.ToString() returns a label.
return mDeleg().ToString();
}
}
///
/// This causes a rule to throw an exception with a particular error message if
/// it fails to match. You would use this rule in a grammar, once you know clearly what
/// you are trying to parse, and failure is clearly an error. In other words you are saying
/// that back-tracking is of no use.
///
public class NoFailRule : Rule
{
Rule mRule;
string msMsg;
public NoFailRule(Rule r, string s)
{
Trace.Assert(r != null);
mRule = r;
msMsg = s;
}
public override bool Match(Parser p)
{
// TODO: make a more descriptive error message.
if (!mRule.Match(p))
throw new Exception(msMsg);
return true;
}
}
///
/// This corresponds to a sequence operator in a PEG grammar. This tries
/// to match a series of rules in order, if one rules fails, then the entire
/// group fails and the parser index is returned to the original state.
///
public class SeqRule : Rule
{
public SeqRule(Rule[] xs)
{
foreach (Rule r in xs)
Trace.Assert(r != null);
mRules = xs;
}
public override bool Match(Parser p)
{
int iter = p.GetPos();
foreach (Rule r in mRules)
{
if (!r.Match(p))
{
p.SetPos(iter);
return false;
}
}
return true;
}
public override string ToString()
{
string result = "(";
if (mRules.Length > 0)
{
result += mRules[0].ToString();
foreach (Rule r in mRules)
{
result += " , " + r.ToString();
}
}
return result + ")";
}
Rule[] mRules;
}
///
/// This rule corresponds to a choice operator in a PEG grammar. This rule
/// is successful if any of the matching rules are successful. The ordering of the
/// rules imply precedence. This means that the grammar will be unambiguous, and
/// differentiates the grammar as a PEG grammar from a context free grammar (CFG).
///
public class ChoiceRule : Rule
{
public ChoiceRule(Rule[] xs)
{
foreach (Rule r in xs)
Trace.Assert(r != null);
mRules = xs;
}
public override bool Match(Parser p)
{
foreach (Rule r in mRules)
{
if (r.Match(p))
return true;
}
return false;
}
public override string ToString()
{
string result = "(";
if (mRules.Length > 0)
{
result += mRules[0].ToString();
foreach (Rule r in mRules)
{
result += " | " + r.ToString();
}
}
return result + ")";
}
Rule[] mRules;
}
///
/// This but attempts to match a optional rule. It always succeeds
/// whether the underlying rule succeeds or not.
///
public class OptRule : Rule
{
public OptRule(Rule r)
{
Trace.Assert(r != null);
mRule = r;
}
public override bool Match(Parser p)
{
mRule.Match(p);
return true;
}
public override string ToString()
{
return mRule.ToString() + "?";
}
Rule mRule;
}
///
/// This attempts to match a rule 0 or more times. It will always succeed,
/// and will match the rule as often as possible. Unlike the * operator
/// in PERL regular expressions, partial backtracking is not possible.
///
public class StarRule : Rule
{
public StarRule(Rule r)
{
Trace.Assert(r != null);
mRule = r;
}
public override bool Match(Parser p)
{
while (mRule.Match(p))
{ }
return true;
}
public override string ToString()
{
return mRule.ToString() + "*";
}
Rule mRule;
}
///
/// This is similar to the StarRule except it matches a rule 1 or more times.
///
public class PlusRule : Rule
{
public PlusRule(Rule r)
{
Trace.Assert(r != null);
mRule = r;
}
public override bool Match(Parser p)
{
if (!mRule.Match(p))
return false;
while (mRule.Match(p))
{ }
return true;
}
public override string ToString()
{
return mRule.ToString() + "+";
}
Rule mRule;
}
///
/// Asssures that no more input exists
///
public class EndOfInputRule : Rule
{
public override bool Match(Parser p)
{
return p.AtEnd();
}
public override string ToString()
{
return "_eof_";
}
}
///
/// This returns true if a rule can not be matched.
/// It never advances the parser.
///
public class NotRule : Rule
{
public NotRule(Rule r)
{
Trace.Assert(r != null);
mRule = r;
}
public override bool Match(Parser p)
{
int pos = p.GetPos();
if (mRule.Match(p))
{
p.SetPos(pos);
return false;
}
Trace.Assert(p.GetPos() == pos);
return true;
}
public override string ToString()
{
return "!" + mRule.ToString();
}
Rule mRule;
}
///
/// Attempts to match a specific character.
///
public class SingleCharRule : Rule
{
public SingleCharRule(char x)
{
mData = x;
}
public override bool Match(Parser p)
{
if (p.AtEnd()) return false;
if (p.GetChar() == mData)
{
p.GotoNext();
return true;
}
return false;
}
public override string ToString()
{
return "";
//return mData.ToString();
}
char mData;
}
///
/// Attempts to match a sequence of characters.
///
public class CharSeqRule : Rule
{
public CharSeqRule(string x)
{
mData = x;
}
public override bool Match(Parser p)
{
if (p.AtEnd()) return false;
int pos = p.GetPos();
foreach (char c in mData)
{
if (p.GetChar() != c)
{
p.SetPos(pos);
return false;
}
p.GotoNext();
}
return true;
}
public override string ToString()
{
return "[" + mData + "]";
}
string mData;
}
///
/// Matches any character and advances the parser, unless it is
/// at the end of the input.
///
public class AnyCharRule : Rule
{
public override bool Match(Parser p)
{
if (!p.AtEnd())
{
p.GotoNext();
return true;
}
return false;
}
public override string ToString()
{
return ".";
}
}
///
/// Returns true and advances the parser if the current character matches any
/// member of a specific set of characters.
///
public class CharSetRule : Rule
{
public CharSetRule(string s)
{
mData = s;
}
public override bool Match(Parser p)
{
if (p.AtEnd())
return false;
foreach (char c in mData)
{
if (c == p.GetChar())
{
p.GotoNext();
return true;
}
}
return false;
}
public override string ToString()
{
return "[" + mData + "]";
}
string mData;
}
///
/// Returns true and advances the parser if the current character matches any
/// member of a specific range of characters.
///
public class CharRangeRule : Rule
{
public CharRangeRule(char first, char last)
{
mFirst = first;
mLast = last;
Trace.Assert(mFirst < mLast);
}
public override bool Match(Parser p)
{
if (p.AtEnd()) return false;
if (p.GetChar() >= mFirst && p.GetChar() <= mLast)
{
p.GotoNext();
return true;
}
return false;
}
public override string ToString()
{
return "[" + mFirst.ToString() + ".." + mLast.ToString() + "]";
}
char mFirst;
char mLast;
}
///
/// Matches a rule over and over until a terminating rule can be
/// successfully matched.
///
public class WhileNotRule : Rule
{
public WhileNotRule(Rule elem, Rule term)
{
mElem = elem;
mTerm = term;
}
public override bool Match(Parser p)
{
int pos = p.GetPos();
while (!mTerm.Match(p))
{
if (!mElem.Match(p))
{
p.SetPos(pos);
return false;
}
}
return true;
}
public override string ToString()
{
return "(" + mElem.ToString() + "* !" + mTerm.ToString() + ")";
}
Rule mElem;
Rule mTerm;
}
public static Rule EndOfInput() { return new EndOfInputRule(); }
public static Rule Delay(RuleDelegate r) { return new DelayRule(r); }
public static Rule SingleChar(char c) { return new SingleCharRule(c); }
public static Rule CharSeq(string s) { return new CharSeqRule(s); }
public static Rule AnyChar() { return new AnyCharRule(); }
public static Rule NotChar(char c) { return Seq(Not(SingleChar(c)), AnyChar()); }
public static Rule CharSet(string s) { return new CharSetRule(s); }
public static Rule CharRange(char first, char last) { return new CharRangeRule(first, last); }
public static Rule AstNode(Object label, Rule x) { return new AstNodeRule(label, x); }
public static Rule NoFail(Rule r, string s) { return new NoFailRule(r, s); }
public static Rule Seq(Rule x0, Rule x1) { return new SeqRule(new Rule[] { x0, x1 }); }
public static Rule Seq(Rule x0, Rule x1, Rule x2) { return new SeqRule(new Rule[] { x0, x1, x2 }); }
public static Rule Seq(Rule x0, Rule x1, Rule x2, Rule x3) { return new SeqRule(new Rule[] { x0, x1, x2, x3 }); }
public static Rule Seq(Rule x0, Rule x1, Rule x2, Rule x3, Rule x4) { return new SeqRule(new Rule[] { x0, x1, x2, x3, x4 }); }
public static Rule Seq(Rule x0, Rule x1, Rule x2, Rule x3, Rule x4, Rule x5) { return new SeqRule(new Rule[] { x0, x1, x2, x3, x4, x5 }); }
public static Rule Seq(Rule x0, Rule x1, Rule x2, Rule x3, Rule x4, Rule x5, Rule x6) { return new SeqRule(new Rule[] { x0, x1, x2, x3, x4, x5, x6 }); }
public static Rule Choice(Rule x0, Rule x1) { return new ChoiceRule(new Rule[] { x0, x1 }); }
public static Rule Choice(Rule x0, Rule x1, Rule x2) { return new ChoiceRule(new Rule[] { x0, x1, x2 }); }
public static Rule Choice(Rule x0, Rule x1, Rule x2, Rule x3) { return new ChoiceRule(new Rule[] { x0, x1, x2, x3 }); }
public static Rule Choice(Rule x0, Rule x1, Rule x2, Rule x3, Rule x4) { return new ChoiceRule(new Rule[] { x0, x1, x2, x3, x4 }); }
public static Rule Opt(Rule x) { return new OptRule(x); }
public static Rule Star(Rule x) { return new StarRule(x); }
public static Rule Plus(Rule x) { return new PlusRule(x); }
public static Rule Not(Rule x) { return new NotRule(x); }
public static Rule WhileNot(Rule elem, Rule term) { return new WhileNotRule(elem, term); }
public static Rule NL() { return CharSet("\n"); }
public static Rule LowerCaseLetter() { return CharRange('a', 'z'); }
public static Rule UpperCaseLetter() { return CharRange('A', 'Z'); }
public static Rule Letter() { return Choice(LowerCaseLetter(), UpperCaseLetter()); }
public static Rule Digit() { return CharRange('0', '9'); }
public static Rule HexDigit() { return Choice(Digit(), Choice(CharRange('a', 'f'), CharRange('A', 'F'))); }
public static Rule BinaryDigit() { return CharSet("01"); }
public static Rule IdentFirstChar() { return Choice(SingleChar('_'), Letter()); }
public static Rule IdentNextChar() { return Choice(IdentFirstChar(), Digit()); }
public static Rule Ident() { return Seq(IdentFirstChar(), Star(IdentNextChar())); }
public static Rule EOW() { return Not(IdentNextChar()); }
public static Rule DelimitedGroup(String x, Rule r, String y) { return Seq(CharSeq(x), Star(r), NoFail(CharSeq(y), "expected " + y)); }
}
}