package org.himinbi.parser; import java.util.Hashtable; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.Set; import java.util.List; import java.util.Collection; import java.util.Collections; import java.util.Vector; import java.util.Stack; import java.util.Map; import java.io.IOException; import org.apache.log4j.Logger; import org.apache.log4j.Level; import org.himinbi.tokenizer.Token; import org.himinbi.tokenizer.TokenStream; public class LLParser extends ParserBase { /** * Actions represents the parse table. */ Map actions; /** * Element tro begin the parse with. */ Element startSymbol; Grammar grammar; /** * Log4j logging category */ static Logger log = Logger.getLogger(LLParser.class); public LLParser() { } public void setGrammar(Grammar grammar) { actions = generateParseTable(grammar); this.grammar = grammar; startSymbol = grammar.getStartSymbol(); } public Map getParseTable() { return actions; } /** * Generates an LL parse table with The keys are non-terminals and * each returns another map that is keyed with terminals and returns * a Rule. */ public static Map generateParseTable(Grammar grammar) { /** * The heart of a LL tabular parser is the parse table which * tells which grammar rule to choose next based on the * current grammar rule and the next terminal in the stream. * * The parse table is generated by first creating two * collections of terminals: * first(A) is the set of all terminals that can appear first * in an expansion of the non-terminal A * follow(A) is the set of all terminals that can appear * following the non-terminal A. This does not include * the empty terminal */ /** * To properly find the first and follow sets it is necessary to * which terminals may expand to the empty terminal. This includes * not only nonterminals that expand to the empty nonterminal, * but also nonterminals that expand to a list of nonterminals * that expand to the empty terminal. * * To find which nonterminals may be empty iterate over all * the rules. If a nonterminal expands to the empty terminal * add it to the list or if consists entiery on nonterminals * that expand to the empty terminal add it. Repeat the iterations * until an entire iteration is done without making a change. */ Map firstSets = new HashMap(); Map followSets = new HashMap(); /** * It is known that all the elements of the first and follow * sets are terminals, so they are stored as strings for * easier access without ant loss of information. * * The first for any terminal is itself. */ Iterator allTerminals = grammar.getTerminals().iterator(); while(allTerminals.hasNext()) { Element next = (Element)allTerminals.next(); firstSets.put(next, Collections.singleton(next.getValue())); } /** * Every nonterminal has a first and follow set though it * may be empty for some elements (frequently follows is * empty for the start symbol.) */ Iterator allNonterminals = grammar.getNonterminals().iterator(); while(allNonterminals.hasNext()) { Element next = (Element)allNonterminals.next(); firstSets.put(next, new HashSet()); followSets.put(next, new HashSet()); } Collection emptyElements = getEmptyElements(grammar); if(log.isDebugEnabled()) { log.debug("Empties:"); Iterator empties = emptyElements.iterator(); while(empties.hasNext()) { log.debug(" " + empties.next()); } } /** * To generate the firsts again iterate over all the rules. * Go through the expansion until then end is reached or a * nonempty element is found. Add all of the firsts for * each elements passed should be added. */ boolean changesMade; do { changesMade = false; Iterator rules = grammar.getRules().iterator(); while(rules.hasNext()) { Rule rule = (Rule)rules.next(); Set firsts = (Set)firstSets.get(rule.getProduct()); Iterator elements = rule.getExpansion().iterator(); Element nextElement; do { nextElement = (Element)elements.next(); if(nextElement != Element.EMPTY_TERMINAL) { // The empty terminal does not appear in any first set changesMade = (changesMade || firsts.addAll((Set)firstSets.get(nextElement))); } } while(emptyElements.contains(nextElement) && elements.hasNext()); } } while(changesMade); if(log.isDebugEnabled()) { printSets("First Sets:", firstSets, Level.DEBUG); } /** * The follows are found similarly to the firsts. * Again all of the rules are iterated over. * If an element, A, is a non-terminal then: * If the next element, B, is set then add the firsts * for B to the follows for A. * If there is no next element then add the follows * for the non-terminal that this rule is an expansion * of to the follows for A. * If B can be empty then repeat the process for the * element following B. * Continue doing this until all the rules are interated * over without making any changes. */ do { changesMade = false; Iterator rules = grammar.getRules().iterator(); while(rules.hasNext()) { Rule rule = (Rule)rules.next(); List elements = rule.getExpansion(); int currentIndex = elements.size() - 1; Set follows = new HashSet(); while(currentIndex >= 0) { /** * The easiest way to handle this is to walk backward * through the expansion. Start at the end of the * list with the follows for the product of the rule. * Step backward til the first element is reached. * Each time a step backward is taken if the element * that is being stepped over can be empty then leave * the follows that are already present. Either way * add the firsts for the element being stepped over. */ Element currentElement = (Element)elements.get(currentIndex); // Terminals don't need to track follows if(currentElement.getType() == Element.NONTERMINAL_TYPE) { if(currentIndex == elements.size() - 1) { // If we are at the end of a rule then the follows // for this element are the things that can follow // the nonterminal that this is an expansion of follows.addAll((Set)followSets.get(rule.getProduct())); } else { // Otherwise add the first things that can be seen // in the expansion of what follows this element follows.addAll((Set)firstSets. get(elements.get(currentIndex + 1))); } changesMade = (changesMade || ((Set)followSets.get (currentElement)).addAll(follows)); // Whenever an element may be empty then all of the // current follows will also hold for the next element if(!emptyElements.contains(currentElement)) { follows.clear(); } } currentIndex--; } } } while(changesMade); if(log.isDebugEnabled()) { printSets("Follow Sets:", followSets, Level.DEBUG); } /** * Using the first and follow sets it is possible to generate the * parse table for this grammar. * * Recall: * first(A) is the set of all terminals that can appear first in an * expansion of the non-terminal A including the empty terminal * follow(A) is the set of all terminals that can appear following an * expansion of the non-terminal A * * Using these we can define: * action(A, a) which is the rule to use to expand the non-terminal A * given the next terminal to be processed, a * * This is done by iterating over all the rules. For each rule, R, * which is an expansion of A: * If R starts with a terminal, b, then action(A, b) = R * If R starts with a non-terminal, B, then action(A, b) = R for each * terminal, b in first(B). Additionally, if first(B) contains * the empty terminal then then action(A, c) = R for each terminal, * c in follow(B) */ Iterator rules = grammar.getRules().iterator(); Hashtable actions = new Hashtable(); while(rules.hasNext()) { Rule rule = (Rule)rules.next(); Element product = (Element)rule.getProduct(); Hashtable action = (Hashtable)actions.get(product); if(action == null) { action = new Hashtable(); actions.put(product, action); } Element first = (Element)rule.getExpansion().get(0); Vector terminalSet = new Vector(); if(first != Element.EMPTY_TERMINAL) { Set firsts = (Set)firstSets.get(first); terminalSet.addAll(firsts); } if(emptyElements.contains(rule)) { Set follows = (Set)followSets.get(product); if(follows != null) { terminalSet.addAll(follows); } else { throw new ParseError("No follow for " + product); } } Iterator terminals = terminalSet.iterator(); while(terminals.hasNext()) { String terminal = (String)terminals.next(); if(action.containsKey(terminal)) { throw new IllegalArgumentException ("Error: Multiple start terminals found for: " + product + " on " + terminal + " " + "(" + action.get(terminal) + "/" + rule); } action.put(terminal, rule); } } return actions; } public static Collection getEmptyElements(Grammar grammar) { Collection emptyElements = new HashSet(); // All other empty elements arise from this. emptyElements.add(Element.EMPTY_TERMINAL); boolean changesMade; do { changesMade = false; Iterator rules = grammar.getRules().iterator(); while(rules.hasNext()) { Rule rule = (Rule)rules.next(); Iterator expansion = rule.getExpansion().iterator(); boolean empty = true; Element nextElement = null; while(empty && expansion.hasNext()) { nextElement = (Element)expansion.next(); empty = (empty && emptyElements.contains(nextElement)); } if(empty && nextElement != null) { changesMade = (changesMade || emptyElements.add(rule) || emptyElements.add(rule.getProduct())); } } } while(changesMade); return emptyElements; } /** * Checks if any grammar rules with the same product have the same initial elements * in their completion. If there are then it attempts to modify the grammar using * left factorization so as to make the grammar suitable for LL(1) processing. * * Left factorization looks like this. For example for this grammar rule: * * A = ('b', B) | ('b', C); * * A is ambiguous for a LL(1) parse because it is not possible to decide what to * expand A to based on the next terminal, 'b'. This rule can be replaced by * another rule that will produce an equivalent language: * * A = 'b', (B | C); * */ public static void leftFactor(Grammar grammar) { } protected static void printSets(String title, Map sets, Level level) { log.log(level, title); Iterator keys = sets.keySet().iterator(); while(keys.hasNext()) { StringBuffer set = new StringBuffer(); Element key = (Element)keys.next(); set.append(key + " -> "); Iterator terminals = ((Set)sets.get(key)).iterator(); while(terminals.hasNext()) { String element = (String)terminals.next(); set.append("'" + element + "'"); if(terminals.hasNext()) { set.append(", "); } } log.log(level, " " + set.toString()); } } public void parse(TokenStream tokens) throws ParseException, IOException { fireParseStarted(); if(!tokens.hasNext()) { log.warn("Parsed empty token stream"); } else { Stack expansions = new Stack(); Element currentProduct = grammar.getStartSymbol(); if(currentProduct == null) { throw new NullPointerException("No start symbol"); } Iterator expansion = null; Token nextToken = tokens.nextToken(); boolean error = false; do { // Descent needed is marked by a null expansion if(expansion == null) { Rule nextRule = (Rule)((Map)actions.get(currentProduct)). get(nextToken.getType()); if(nextRule == null) { throw new IOException("No rule for " + nextToken.getType() + " from " + currentProduct.getValue()); } expansion = nextRule.getExpansion().iterator(); if(!currentProduct.isAnonymous()) { fireRuleStarted(currentProduct.getValue()); } } Element nextElement = null; // Skip over any empty terminals other then a final one while(expansion.hasNext() && (nextElement = (Element)expansion.next()) == Element.EMPTY_TERMINAL); // If the next element is a terminal try to eat // a token off the stream if(nextElement.getType() == Element.TERMINAL_TYPE) { if(nextToken.getType() .equals(nextElement.getValue())) { fireTerminal(nextToken.getType(), nextToken.getValue()); nextToken = tokens.hasNext() ? tokens.nextToken() : null; } // Some rules will end with an empty terminal // (often the whole expansion is an empty terminal) // The parser needs to be allowed to bubble up a // level to match the next token else if(nextElement != Element.EMPTY_TERMINAL) { ParseError parseError = new ParseError("Found terminal: " + nextToken.getType() + " expected " + nextElement.getValue()); fireParseError(parseError); error = true; } } // If the next element is a nonterminal then expand it else if(nextElement.getType() == Element.NONTERMINAL_TYPE) { expansions.push(new Object[] { expansion, currentProduct }); currentProduct = nextElement; expansion = null; } else { ParseError parseError = new ParseError("Unknown element type: " + nextElement); fireParseError(parseError); error = true; } // Bubble back up after any finished expansions while(expansion != null && !expansion.hasNext()) { if(!currentProduct.isAnonymous()) { fireRuleFinished(currentProduct.getValue()); } Object[] pop = (Object[])expansions.pop(); expansion = (Iterator)pop[0]; currentProduct = (Element)pop[1]; } // Next token goes to null when there are no more } while(nextToken != null && !error); // Check to see if all the expansions were used // If they weren't, see if they can be finished // using empty elements to get to an end if(!error && expansions.size() > 0) { Collection emptyElements = LLParser.getEmptyElements(grammar); boolean done = false; do { if(expansion == null && expansions.size() > 0) { Object[] pop = (Object[])expansions.pop(); expansion = (Iterator)pop[0]; currentProduct = (Element)pop[1]; } Element testElement = null; while(expansion.hasNext() && testElement == null) { testElement = (Element)expansion.next(); if(emptyElements.contains(testElement)) { testElement = null; } } // Each empty element is nulled, so if the element // is set here then it was nonempty and is an error if(testElement != null) { ParseError parseError = new ParseError ("Incomplete parse: unable to fulfill element: " + testElement); fireParseError(parseError); error = true; } // Otherwise the expansion was completed else { if(!currentProduct.isAnonymous()) { fireRuleFinished(currentProduct.getValue()); } expansion = null; } } while(expansions.size() > 0 && !error); } } fireParseFinished(); } }