Basil

Contents:
Home
Synopsis
Config file format
Grammar file format
Symbol attributes
Getting started
Visitors
Simulator
Examples
Download

Getting started

To build a parser, follow these steps:
  1. Write the grammar.

    The basil simulator can be used to test the grammar.

  2. Write the lexer.

    The lexer must derive from the class basl::Lexer:

    // basl_Lexer.lzz
    //
    
    $hdr
    $end
    
    $src
    $end
    
    // basil
    namespace basl
    {
      // types
      class Token;
    
      // Lexer
      class Lexer
      {
      public:
        // destructor
        virtual ~ Lexer () { }
    
        // get next token
        virtual Token getNextToken (int lex_state) = 0;
      }
    }
    
    The lexer must override the function getNextToken. The lex_state argument is the lexical state of the current parsing state. It can be used to tokenize context sensitive tokens. The function must return the next input token. The token type is defined in basl_Token.lzz:

    // basl_Token.lzz
    //
    
    $hdr
    #include "util_Ident.h"
    #include "util_Loc.h"
    $end
    
    $src
    $end
    
    // basil
    namespace basl
    {
      // Token
      class Token
      {
        // number (-1 if not set)
        int m_number;
    
        // location
        util::Loc m_loc;
        
        // lexeme
        util::Ident m_lexeme;
    
      public:
        // default constructor
        Token ()
          : m_number (-1)
        {
        }
    
        // constructor
        Token (int number, util::Loc const & loc, util::Ident const & lexeme)
          : m_number (number), m_loc (loc), m_lexeme (lexeme)
        {
        }
    
        // true if set
        bool isSet () const
        {
          return m_number != -1;
        }
    
        // get number
        int getNumber () const
        {
          return m_number;
        }
    
        // get lexeme
        util::Ident const & getLexeme () const
        {
          return m_lexeme;
        }
    
        // get location
        util::Loc const & getLoc () const
        {
          return m_loc;
        }
      }
    }
    
    The location and lexeme types can be changed to suit the lexer. Since the location and lexeme types are auxiliary to the parser they are defined in the util namespace.

    basil generates the token numbers in the file TokenNumber.lzz. The numbers are zero and greater. The number EOT must be the number of the last token. EOT is always 0.

    If you choose to use util::Ident as the type of the lexemes, then the the class util::IdentTable can be used to create and manage lexemes.

  3. Write the parser.

    The parser must derive from the class basl::Parser:

    // basl_Parser.lzz
    //
    
    $hdr
    $end
    
    $src
    $end
    
    // basil
    namespace basl
    {
      // types
      class Nonterm;
    
      // Parser
      class Parser
      {
      public:
        // destructor
        virtual ~ Parser () { }
    
        // on node, returns false if syntax error
        virtual bool onNode (Nonterm & nonterm) = 0;
      }
    }
    
    The parser must override the function onNode. This function is called when a node is created. Semantic actions can be performed on the node. If the function returns true, then the parser will continue. Otherwise, the parser will backtrack; if the parser is not guessing, then it will report a syntax error and recover if possible.

  4. Call the parser.

    Parsing begins by calling the basl::parse function. This function is defined in basl_Parse.lzz:

    // basl_Parse.lzz
    //
    
    $hdr
    $end
    
    $src
    // ...
    $end
    
    // basil
    namespace basl
    {
      // types
      struct LRData; 
      class Lexer;
      class Parser;
      class SyntaxTree;
      class ErrorRec;
    
      // parse, returns false if syntax errors
      bool parse (LRData const & data, Lexer & lexer, Parser & parser,
          int start_state, ErrorRec const & error_rec, bool trace,
          SyntaxTree & tree)
      {
        // ...
      }
    }
    
    The first argument to the function must be the value returned by getParserData.

    The second and third arguments are references to the lexer and parser respectively.

    The fourth argument, start_state, is the state at which to begin parsing. State 0 corresponds to the first start rule, 1 corresponds to the second start rule, and so on.

    error_rec is an object that contains the syntax error recover strategies. Recover strategies can be created by calling the member functions of basl::ErrorRec:

    // basl_ErrorRec.lzz
    //
    // error recovery
    //
    
    $hdr
    #include "basl_FreeTokenVector.h"
    $end
    
    $src
    // ...
    $end
    
    // basil
    namespace basl
    {
      // ErrorRec
      class ErrorRec
      {
        // ...
    
      public:
        // insert some tokens
        void insertSome (FreeTokenVector const & free_token_set)
        {
          // ...
        }
    
        // discard max_discard tokens
        void discardSome (int max_discard)
        {
          // ...
        }
    
        // discard token number
        void discardOne (int number)
        {
          // ...
        }
    
        // discard max_discard tokens and insert some tokens
        void replaceSome (int max_discard,
            FreeTokenVector const & free_token_set)
        {
          // ...
        }
      
        // discard token number and insert some tokens 
        void replaceOne (int number, FreeTokenVector const & free_token_set)
        {
          // ...
        }
    
        // ...
      }
    }
    
    The parser can attempt any number of error recover strategies. It will try them in the order they are created. The basil simulator can be used to experiment with the recover strategies.

    A free token is simply a token without a location:

    // basl_FreeToken.lzz
    //
    // free token (token without a location)
    //
    
    $hdr
    #include "util_Ident.h"
    $end
    
    $src
    $end
    
    // basl
    namespace basl
    {
      // FreeToken
      struct FreeToken
      {
        // number
        int number;
    
        // lexeme
        util::Ident lexeme;
    
        // constructor
        FreeToken (int number, util::Ident const & lexeme)
          : number (number), lexeme (lexeme)
        {
        }
      }
    }
    
    The sixth argument to basl::parse, trace, is a flag. If true the parser will print to standard output all state transitions.

    The last argument, tree, is an output argument. If the parser encouters no syntax errors, then tree will be set to the final syntax tree. basl::SyntaxTree is simply a manager for nodes:

    // basl_SyntaxTree.lzz
    //
    // manages nodes in syntax tree
    // nodes are reused, allocated as needed and deleted at end of program
    //
    
    $hdr
    $end
    
    $src
    #include "basl_Node.h"
    #include "basl_NodePool.h"
    $end
    
    // basl
    namespace basl
    {
      // types
      struct Node;
    
      // SyntaxTree
      class SyntaxTree
      {
        // node
        Node * m_node;
    
      public:
        // constructor
        SyntaxTree (Node * node = 0)
          : m_node (node)
        {
          addNodeRef (m_node);
        }
    
        // copy constructor
        SyntaxTree (SyntaxTree const & tree)
          : m_node (tree.m_node)
        {
          addNodeRef (m_node);
        }
      
        // destructor
        ~ SyntaxTree ()
        {
          remNodeRef (m_node);
        }
    
        // copy assignment
        SyntaxTree & operator = (SyntaxTree const & tree)
        {
          addNodeRef (tree.m_node);
          remNodeRef (m_node);
          m_node = tree.m_node;
          return * this;
        }
    
        // true if empty
        inline bool isEmpty () const
        {
          return m_node == 0;
        }
    
        // get node
        inline Node & getNode () const
        {
          return * m_node;
        }
    
        // pointer
        inline Node * pointer () const
        {
          return m_node;
        }
      }
    }
    
    The function getNode returns a reference to the root node. basl::Node simply encapsulates a token and a nonterminal. Since all nodes are the same size, nodes are easily recycled.