1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
//@ # Parsing
//@
//@
//@ Now that we've turned the source code into tokens we can construct a more
//@ computer-friendly representation for the program. This representation is
//@ often called an *Abstract Syntax Tree* because it's a high-level tree
//@ datastructure which reflects a program's syntax.
//@
//@
//@ ## The General Idea
//@
//@ Before we start with parsing, lets look at an example chunk of Delphi code
//@ to get a feel for the language. A *unit file* is the basic building block of a
//@ Delphi program, analogous to a `*.c` file. The `Main()` function is typically
//@ elsewhere in GUI programs because an application's endpoint is typically 
//@ managed by the GUI framework or IDE.
//@
//@
//@ ```delphi
//@ unit Unit1;
//@
//@ interface
//@
//@ uses
//@   Windows, Messages, SysUtils, Variants, Classes, Graphics, Controls, Forms,
//@   Dialogs, StdCtrls;
//@
//@ type
//@   TForm1 = class(TForm)
//@     Label1: TLabel;      // The label we have added
//@     Button1: TButton;    // The button we have added
//@     procedure Button1Click(Sender: TObject);
//@   private
//@     { private declarations }
//@   public
//@     { public declarations }
//@   end;
//@
//@ var
//@   Form1: TForm1;
//@
//@ implementation
//@
//@ {$R *.dfm}
//@
//@ // The button action we have added
//@ procedure TForm1.Button1Click(Sender: TObject);
//@ begin
//@   Label1.Caption := 'Hello World';    // Label changed when button pressed
//@ end;
//@
//@ end.
//@ ```
//@
//@ At a very high level, a unit file consists of a `unit` statement declaring the 
//@ unit's name, followed by the `interface` (kinda like a `*.h` file) then an
//@ `implementation` section, before ending with a `end.`.
//@
//@ There's a formal language used to express a language's grammar called 
//@ *Backus–Naur form*. That previous paragraph would translate to something like 
//@ the following:
//@
//@ ```ebnf
//@ file        = unit_decl interface implementation "end.";
//@ unit_decl   = "unit" unit_name SEMICOLON;
//@ unit_name   = WORD;
//@ interface   = "interface" uses types vars;
//@ uses        = "uses" WORD ("," WORD)* SEMICOLON
//@              | ε;
//@ types       = "type" type_decl*;
//@ vars        = "var" var_decl*;
//@ ```
//@
//@ With the terminals (`WORD`, `SEMICOLON`, and friends) being their usual selves.
//@
//@ Delphi has a pretty simple syntax, so we're going to use a standard recursive
//@ descent parser. This is just an object which has a method roughly corresponding 
//@ to each rule in the language's grammar.
//@
//@
//@ ## The Parser Object
//@
//@ As usual, before we can do anything else we're going to have to import a couple
//@ dependencies.

use std::rc::Rc;

use lex::{Token, TokenKind};
use codemap::{Span, FileMap};
use parse::ast::{Literal, LiteralKind, Ident, DottedIdent};
use errors::*;

//@ The `Parser` itself just contains the tokens and their corresponding `FileMap`.

/// A parser for turning a stream of tokens into a Abstract Syntax Tree.
#[derive(Debug)]
pub struct Parser {
  tokens: Vec<Token>,
  filemap: Rc<FileMap>,
  current_index: usize,
}

impl Parser {
  /// Create a new parser.
  pub fn new(tokens: Vec<Token>, filemap: Rc<FileMap>) -> Parser {
    let current_index = 0;
    Parser { tokens, filemap, current_index }
  }

  /// Peek at the current token.
  fn peek(&self) -> Option<&TokenKind> {
    self.tokens.get(self.current_index).map(|t| &t.kind)
  }

  /// Get the current token, moving the index along one.
  fn next(&mut self) -> Option<&Token> {
    let tok = self.tokens.get(self.current_index);

    if tok.is_some() {
      self.current_index += 1;
    }

    tok
  }
}

//@ We'll implement the various grammar rules from the bottom up. Meaning we'll 
//@ start with the very basics like expressions, then build things up until we
//@ get to the overall program.
//@
//@ First up lets have a go at parsing `Literals`. We do it in two steps, first
//@ you peek at the next token to make sure it's a kind you expect, then you
//@ unpack the token and convert it into it's equivalent AST node. A lot of the
//@ pattern matching boilerplate can be minimised with the judicious use of macros.

impl Parser {
  fn parse_literal(&mut self) -> Result<Literal> {
    match self.peek() {
      Some(&TokenKind::Integer(_)) | 
      Some(&TokenKind::Decimal(_)) | 
      Some(&TokenKind::QuotedString(_)) => {},
      Some(_) => bail!("Expected a literal"),
      None => bail!(ErrorKind::UnexpectedEOF),
    };

    let next = self.next().expect("unreachable");
    let lit_kind = match next.kind {
      TokenKind::Integer(i) => LiteralKind::Integer(i),
      TokenKind::Decimal(d) => LiteralKind::Decimal(d),
      TokenKind::QuotedString(ref s) => LiteralKind::String(s.clone()),
      ref other => panic!("Unreachable token kind: {:?}", other),
    };

    Ok(Literal {
      span: next.span,
      kind: lit_kind
    })
  }
}

//@ Like the tokenizing module, we're going to need to write lots of tests to 
//@ check our parser recognises things as we expect them to. Unfortunately the
//@ types and syntactic structures used will be slightly different, so we'll
//@ use macros to abstract away a lot of the boilerplate.

macro_rules! parser_test {
  ($name:ident, $method:ident, $src:expr => $should_be:expr) =>  {
    #[cfg(test)]
    #[test]
    fn $name() {
      // create a codemap and tokenize our input string
      let mut codemap = $crate::codemap::CodeMap::new();
      let filemap = codemap.insert_file("dummy.pas", $src);
      let tokenized = $crate::lex::tokenize(filemap.contents())
        .chain_err(|| "Tokenizing failed")
        .unwrap();
      let tokens = filemap.register_tokens(tokenized);

      let should_be = $should_be;

      let mut parser = Parser::new(tokens, filemap);
      let got = parser.$method().unwrap();

      assert_eq!(got, should_be);
    }
  }
}

//@ Now we have our basic test harness set up, lets see if literal parsing works.

parser_test!(integer_literal, parse_literal, "123" => LiteralKind::Integer(123));
parser_test!(parse_float_literal, parse_literal, "12.3" => LiteralKind::Decimal(12.3));
// TODO: re-enable this when string parsing is implemented
// parser_test!(parse_string_literal, parse_literal, "'12.3'" => LiteralKind::String("12.3".to_string()));

//@ Another easy thing to implement is parsing identifiers and dotted identifiers 
//@ (e.g. `foo.bar.baz`). To recognise a dotted identifier you first look for one
//@ identifier, then keep trying to take a pair of dots and idents until there are
//@ no more.


impl Parser {
  fn parse_ident(&mut self) -> Result<Ident> {
    match self.peek() {
      Some(&TokenKind::Identifier(_)) => {},
      _ => bail!("Expected an identifier"),
    }

    let next = self.next().unwrap();

    if let TokenKind::Identifier(ref ident) = next.kind {
      Ok(Ident {
        span: next.span,
        name: ident.clone(),
      })
    } else {
      unreachable!()
    }
  }

  fn parse_dotted_ident(&mut self) -> Result<DottedIdent> {
    let first = self.parse_ident()?;
    let mut parts = vec![first];

    while self.peek() == Some(&TokenKind::Dot) {
      let _ = self.next();
      let next = self.parse_ident()?;
      parts.push(next);
    }

    // the span for a dotted ident should be the union of the spans for
    // each of its components.
    let span = parts.iter()
                    .skip(1)
                    .fold(parts[0].span, |l, r| self.filemap.merge(l, r.span));

    Ok(DottedIdent { span, parts })    
  }
}

//@ Using our `parser_test!()` macro makes these a piece of cake to test.

parser_test!(parse_a_basic_ident, parse_ident, "foo" => "foo");
parser_test!(parse_a_dotted_ident, parse_dotted_ident, "foo.bar.baz" => ["foo", "bar", "baz"]);
parser_test!(parse_a_single_ident_as_dotted, parse_dotted_ident, "foo" => ["foo"]);