//@ # Parsing
//@
//@
//@ Now that we've turned the source code into tokens we can construct a more
//@ computer-friendly representation for the program. This representation is
//@ often called an *Abstract Syntax Tree* because it's a high-level tree
//@ data structure which reflects a program's syntax.
//@
//@
//@ ## The General Idea
//@
//@ Before we start with parsing, let's look at an example chunk of Delphi code
//@ to get a feel for the language. A *unit file* is the basic building block of a
//@ Delphi program, analogous to a `*.c` file. The `Main()` function is typically
//@ elsewhere in GUI programs because an application's entry point is typically
//@ managed by the GUI framework or IDE.
//@
//@
//@ ```delphi
//@ unit Unit1;
//@
//@ interface
//@
//@ uses
//@   Windows, Messages, SysUtils, Variants, Classes, Graphics, Controls, Forms,
//@   Dialogs, StdCtrls;
//@
//@ type
//@   TForm1 = class(TForm)
//@     Label1: TLabel;   // The label we have added
//@     Button1: TButton; // The button we have added
//@     procedure Button1Click(Sender: TObject);
//@   private
//@     { private declarations }
//@   public
//@     { public declarations }
//@   end;
//@
//@ var
//@   Form1: TForm1;
//@
//@ implementation
//@
//@ {$R *.dfm}
//@
//@ // The button action we have added
//@ procedure TForm1.Button1Click(Sender: TObject);
//@ begin
//@   Label1.Caption := 'Hello World'; // Label changed when button pressed
//@ end;
//@
//@ end.
//@ ```
//@
//@ At a very high level, a unit file consists of a `unit` statement declaring the
//@ unit's name, followed by the `interface` (kinda like a `*.h` file) then an
//@ `implementation` section, before ending with an `end.`.
//@
//@ There's a formal language used to express a language's grammar called
//@ *Backus–Naur form*.
That previous paragraph would translate to something like //@ the following: //@ //@ ```ebnf //@ file = unit_decl interface implementation "end."; //@ unit_decl = "unit" unit_name SEMICOLON; //@ unit_name = WORD; //@ interface = "interface" uses types vars; //@ uses = "uses" WORD ("," WORD)* SEMICOLON //@ | ε; //@ types = "type" type_decl*; //@ vars = "var" var_decl*; //@ ``` //@ //@ With the terminals (`WORD`, `SEMICOLON`, and friends) being their usual selves. //@ //@ Delphi has a pretty simple syntax, so we're going to use a standard recursive //@ descent parser. This is just an object which has a method roughly corresponding //@ to each rule in the language's grammar. //@ //@ //@ ## The Parser Object //@ //@ As usual, before we can do anything else we're going to have to import a couple //@ dependencies. use std::rc::Rc; use lex::{Token, TokenKind}; use codemap::{Span, FileMap}; use parse::ast::{Literal, LiteralKind, Ident, DottedIdent}; use errors::*; //@ The `Parser` itself just contains the tokens and their corresponding `FileMap`. /// A parser for turning a stream of tokens into a Abstract Syntax Tree. #[derive(Debug)] pub struct Parser { tokens: Vec<Token>, filemap: Rc<FileMap>, current_index: usize, } impl Parser { /// Create a new parser. pub fn new(tokens: Vec<Token>, filemap: Rc<FileMap>) -> Parser { let current_index = 0; Parser { tokens, filemap, current_index } } /// Peek at the current token. fn peek(&self) -> Option<&TokenKind> { self.tokens.get(self.current_index).map(|t| &t.kind) } /// Get the current token, moving the index along one. fn next(&mut self) -> Option<&Token> { let tok = self.tokens.get(self.current_index); if tok.is_some() { self.current_index += 1; } tok } } //@ We'll implement the various grammar rules from the bottom up. Meaning we'll //@ start with the very basics like expressions, then build things up until we //@ get to the overall program. //@ //@ First up lets have a go at parsing `Literals`. 
We do it in two steps, first //@ you peek at the next token to make sure it's a kind you expect, then you //@ unpack the token and convert it into it's equivalent AST node. A lot of the //@ pattern matching boilerplate can be minimised with the judicious use of macros. impl Parser { fn parse_literal(&mut self) -> Result<Literal> { match self.peek() { Some(&TokenKind::Integer(_)) | Some(&TokenKind::Decimal(_)) | Some(&TokenKind::QuotedString(_)) => {}, Some(_) => bail!("Expected a literal"), None => bail!(ErrorKind::UnexpectedEOF), }; let next = self.next().expect("unreachable"); let lit_kind = match next.kind { TokenKind::Integer(i) => LiteralKind::Integer(i), TokenKind::Decimal(d) => LiteralKind::Decimal(d), TokenKind::QuotedString(ref s) => LiteralKind::String(s.clone()), ref other => panic!("Unreachable token kind: {:?}", other), }; Ok(Literal { span: next.span, kind: lit_kind }) } } //@ Like the tokenizing module, we're going to need to write lots of tests to //@ check our parser recognises things as we expect them to. Unfortunately the //@ types and syntactic structures used will be slightly different, so we'll //@ use macros to abstract away a lot of the boilerplate. macro_rules! parser_test { ($name:ident, $method:ident, $src:expr => $should_be:expr) => { #[cfg(test)] #[test] fn $name() { // create a codemap and tokenize our input string let mut codemap = $crate::codemap::CodeMap::new(); let filemap = codemap.insert_file("dummy.pas", $src); let tokenized = $crate::lex::tokenize(filemap.contents()) .chain_err(|| "Tokenizing failed") .unwrap(); let tokens = filemap.register_tokens(tokenized); let should_be = $should_be; let mut parser = Parser::new(tokens, filemap); let got = parser.$method().unwrap(); assert_eq!(got, should_be); } } } //@ Now we have our basic test harness set up, lets see if literal parsing works. 
parser_test!(integer_literal, parse_literal, "123" => LiteralKind::Integer(123)); parser_test!(parse_float_literal, parse_literal, "12.3" => LiteralKind::Decimal(12.3)); // TODO: re-enable this when string parsing is implemented // parser_test!(parse_string_literal, parse_literal, "'12.3'" => LiteralKind::String("12.3".to_string())); //@ Another easy thing to implement is parsing identifiers and dotted identifiers //@ (e.g. `foo.bar.baz`). To recognise a dotted identifier you first look for one //@ identifier, then keep trying to take a pair of dots and idents until there are //@ no more. impl Parser { fn parse_ident(&mut self) -> Result<Ident> { match self.peek() { Some(&TokenKind::Identifier(_)) => {}, _ => bail!("Expected an identifier"), } let next = self.next().unwrap(); if let TokenKind::Identifier(ref ident) = next.kind { Ok(Ident { span: next.span, name: ident.clone(), }) } else { unreachable!() } } fn parse_dotted_ident(&mut self) -> Result<DottedIdent> { let first = self.parse_ident()?; let mut parts = vec![first]; while self.peek() == Some(&TokenKind::Dot) { let _ = self.next(); let next = self.parse_ident()?; parts.push(next); } // the span for a dotted ident should be the union of the spans for // each of its components. let span = parts.iter() .skip(1) .fold(parts[0].span, |l, r| self.filemap.merge(l, r.span)); Ok(DottedIdent { span, parts }) } } //@ Using our `parser_test!()` macro makes these a piece of cake to test. parser_test!(parse_a_basic_ident, parse_ident, "foo" => "foo"); parser_test!(parse_a_dotted_ident, parse_dotted_ident, "foo.bar.baz" => ["foo", "bar", "baz"]); parser_test!(parse_a_single_ident_as_dotted, parse_dotted_ident, "foo" => ["foo"]);