|
| 1 | +//! In this example we build an [S-expression](https://en.wikipedia.org/wiki/S-expression) |
| 2 | +//! parser and tiny [lisp](https://en.wikipedia.org/wiki/Lisp_(programming_language)) interpreter. |
| 3 | +//! Lisp is a simple type of language made up of Atoms and Lists, forming easily parsable trees. |
| 4 | +
|
| 5 | +extern crate jemallocator; |
| 6 | +extern crate nom; |
| 7 | + |
| 8 | +#[global_allocator] |
| 9 | +static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; |
| 10 | + |
| 11 | +use nom::{ |
| 12 | + branch::alt, |
| 13 | + bytes::complete::tag, |
| 14 | + character::complete::{alpha1, char, digit1, multispace0, multispace1, one_of}, |
| 15 | + combinator::{cut, map, map_res, opt}, |
| 16 | + error::{context, VerboseError}, |
| 17 | + multi::many0, |
| 18 | + sequence::{delimited, preceded, terminated, tuple}, |
| 19 | + IResult, |
| 20 | +}; |
| 21 | + |
| 22 | +/// We start by defining the types that define the shape of data that we want. |
| 23 | +/// In this case, we want something tree-like |
| 24 | +
|
/// Starting from the most basic, we define the built-in functions that our lisp has.
///
/// `Eq` and `Hash` are derived alongside `PartialEq` so that a `BuiltIn` can be used
/// directly as a `HashMap`/`HashSet` key (for example in a dispatch table).
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy)]
pub enum BuiltIn {
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*`
    Times,
    /// `/`
    Divide,
    /// `=`
    Equal,
    /// `not`
    Not,
}
| 35 | + |
| 36 | +/// We now wrap this type and a few other primitives into our Atom type. |
| 37 | +/// Remember from before that Atoms form one half of our language. |
| 38 | +
|
| 39 | +#[derive(Debug, PartialEq, Clone)] |
| 40 | +pub enum Atom { |
| 41 | + Num(i32), |
| 42 | + Keyword(String), |
| 43 | + Boolean(bool), |
| 44 | + BuiltIn(BuiltIn), |
| 45 | +} |
| 46 | + |
| 47 | +/// The remaining half is Lists. We implement these as recursive Expressions. |
| 48 | +/// For a list of numbers, we have `'(1 2 3)`, which we'll parse to: |
| 49 | +/// ``` |
| 50 | +/// Expr::Quote(vec![Expr::Constant(Atom::Num(1)), |
| 51 | +/// Expr::Constant(Atom::Num(2)), |
| 52 | +/// Expr::Constant(Atom::Num(3))]) |
| 53 | +/// Quote takes an S-expression and prevents evaluation of it, making it a data |
| 54 | +/// structure that we can deal with programmatically. Thus any valid expression |
| 55 | +/// is also a valid data structure in Lisp itself. |
| 56 | +
|
| 57 | +#[derive(Debug, PartialEq, Clone)] |
| 58 | +pub enum Expr { |
| 59 | + Constant(Atom), |
| 60 | + /// (func-name arg1 arg2) |
| 61 | + Application(Box<Expr>, Vec<Expr>), |
| 62 | + /// (if predicate do-this) |
| 63 | + If(Box<Expr>, Box<Expr>), |
| 64 | + /// (if predicate do-this otherwise-do-this) |
| 65 | + IfElse(Box<Expr>, Box<Expr>, Box<Expr>), |
| 66 | + /// '(3 (if (+ 3 3) 4 5) 7) |
| 67 | + Quote(Vec<Expr>), |
| 68 | +} |
| 69 | + |
| 70 | +/// Continuing the trend of starting from the simplest piece and building up, |
| 71 | +/// we start by creating a parser for the built-in operator functions. |
| 72 | +fn parse_builtin_op<'a>(i: &'a str) -> IResult<&'a str, BuiltIn, VerboseError<&'a str>> { |
| 73 | + // one_of matches one of the characters we give it |
| 74 | + let (i, t) = one_of("+-*/=")(i)?; |
| 75 | + |
| 76 | + // because we are matching single character tokens, we can do the matching logic |
| 77 | + // on the returned value |
| 78 | + Ok(( |
| 79 | + i, |
| 80 | + match t { |
| 81 | + '+' => BuiltIn::Plus, |
| 82 | + '-' => BuiltIn::Minus, |
| 83 | + '*' => BuiltIn::Times, |
| 84 | + '/' => BuiltIn::Divide, |
| 85 | + '=' => BuiltIn::Equal, |
| 86 | + _ => unreachable!(), |
| 87 | + }, |
| 88 | + )) |
| 89 | +} |
| 90 | + |
| 91 | +fn parse_builtin<'a>(i: &'a str) -> IResult<&'a str, BuiltIn, VerboseError<&'a str>> { |
| 92 | + // alt gives us the result of first parser that succeeds, of the series of |
| 93 | + // parsers we give it |
| 94 | + alt(( |
| 95 | + parse_builtin_op, |
| 96 | + // map lets us process the parsed output, in this case we know what we parsed, |
| 97 | + // so we ignore the input and return the BuiltIn directly |
| 98 | + map(tag("not"), |_| BuiltIn::Not), |
| 99 | + ))(i) |
| 100 | +} |
| 101 | + |
| 102 | +/// Our boolean values are also constant, so we can do it the same way |
| 103 | +fn parse_bool<'a>(i: &'a str) -> IResult<&'a str, Atom, VerboseError<&'a str>> { |
| 104 | + alt((map(tag("#t"), |_| Atom::Boolean(true)), map(tag("#f"), |_| Atom::Boolean(false))))(i) |
| 105 | +} |
| 106 | + |
| 107 | +/// The next easiest thing to parse are keywords. |
| 108 | +/// We introduce some error handling combinators: `context` for human readable errors |
| 109 | +/// and `cut` to prevent back-tracking. |
| 110 | +/// |
| 111 | +/// Put plainly: `preceded(tag(":"), cut(alpha1))` means that once we see the `:` |
| 112 | +/// character, we have to see one or more alphabetic chararcters or the input is invalid. |
| 113 | +fn parse_keyword<'a>(i: &'a str) -> IResult<&'a str, Atom, VerboseError<&'a str>> { |
| 114 | + map(context("keyword", preceded(tag(":"), cut(alpha1))), |sym_str: &str| { |
| 115 | + Atom::Keyword(sym_str.to_string()) |
| 116 | + })(i) |
| 117 | +} |
| 118 | + |
| 119 | +/// Next up is number parsing. We're keeping it simple here by accepting any number (> 1) |
| 120 | +/// of digits but ending the program if it doesn't fit into an i32. |
| 121 | +fn parse_num<'a>(i: &'a str) -> IResult<&'a str, Atom, VerboseError<&'a str>> { |
| 122 | + alt(( |
| 123 | + map_res(digit1, |digit_str: &str| digit_str.parse::<i32>().map(Atom::Num)), |
| 124 | + map(preceded(tag("-"), digit1), |digit_str: &str| { |
| 125 | + Atom::Num(-1 * digit_str.parse::<i32>().unwrap()) |
| 126 | + }), |
| 127 | + ))(i) |
| 128 | +} |
| 129 | + |
| 130 | +/// Now we take all these simple parsers and connect them. |
| 131 | +/// We can now parse half of our language! |
| 132 | +fn parse_atom<'a>(i: &'a str) -> IResult<&'a str, Atom, VerboseError<&'a str>> { |
| 133 | + alt((parse_num, parse_bool, map(parse_builtin, Atom::BuiltIn), parse_keyword))(i) |
| 134 | +} |
| 135 | + |
| 136 | +/// We then add the Expr layer on top |
| 137 | +fn parse_constant<'a>(i: &'a str) -> IResult<&'a str, Expr, VerboseError<&'a str>> { |
| 138 | + map(parse_atom, |atom| Expr::Constant(atom))(i) |
| 139 | +} |
| 140 | + |
| 141 | +/// Before continuing, we need a helper function to parse lists. |
| 142 | +/// A list starts with `(` and ends with a matching `)`. |
| 143 | +/// By putting whitespace and newline parsing here, we can avoid having to worry about it |
| 144 | +/// in much of the rest of the parser. |
| 145 | +/// |
| 146 | +/// Unlike the previous functions, this function doesn't take or consume input, instead it |
| 147 | +/// takes a parsing function and returns a new parsing function. |
| 148 | +fn s_exp<'a, O1, F>(inner: F) -> impl Fn(&'a str) -> IResult<&'a str, O1, VerboseError<&'a str>> |
| 149 | +where |
| 150 | + F: Fn(&'a str) -> IResult<&'a str, O1, VerboseError<&'a str>>, |
| 151 | +{ |
| 152 | + delimited( |
| 153 | + char('('), |
| 154 | + preceded(multispace0, inner), |
| 155 | + context("closing paren", cut(preceded(multispace0, char(')')))), |
| 156 | + ) |
| 157 | +} |
| 158 | + |
| 159 | +/// We can now use our new combinator to define the rest of the `Expr`s. |
| 160 | +/// |
| 161 | +/// Starting with function application, we can see how the parser mirrors our data |
| 162 | +/// definitions: our definition is `Application(Box<Expr>, Vec<Expr>)`, so we know |
| 163 | +/// that we need to parse an expression and then parse 0 or more expressions, all |
| 164 | +/// wrapped in an S-expression. |
| 165 | +/// |
| 166 | +/// `tuple` is used to sequence parsers together, so we can translate this directly |
| 167 | +/// and then map over it to transform the output into an `Expr::Application` |
| 168 | +fn parse_application<'a>(i: &'a str) -> IResult<&'a str, Expr, VerboseError<&'a str>> { |
| 169 | + let application_inner = map(tuple((parse_expr, many0(parse_expr))), |(head, tail)| { |
| 170 | + Expr::Application(Box::new(head), tail) |
| 171 | + }); |
| 172 | + // finally, we wrap it in an s-expression |
| 173 | + s_exp(application_inner)(i) |
| 174 | +} |
| 175 | + |
| 176 | +/// Because `Expr::If` and `Expr::IfElse` are so similar (we easily could have |
| 177 | +/// defined `Expr::If` to have an `Option` for the else block), we parse both |
| 178 | +/// in a single function. |
| 179 | +/// |
| 180 | +/// In fact, we define our parser as if `Expr::If` was defined with an Option in it, |
| 181 | +/// we have the `opt` combinator which fits very nicely here. |
| 182 | +fn parse_if<'a>(i: &'a str) -> IResult<&'a str, Expr, VerboseError<&'a str>> { |
| 183 | + let if_inner = context( |
| 184 | + "if expression", |
| 185 | + map( |
| 186 | + preceded( |
| 187 | + // here to avoid ambiguity with other names starting with `if`, if we added |
| 188 | + // variables to our language, we say that if must be terminated by at least |
| 189 | + // one whitespace character |
| 190 | + terminated(tag("if"), multispace1), |
| 191 | + cut(tuple((parse_expr, parse_expr, opt(parse_expr)))), |
| 192 | + ), |
| 193 | + |(pred, true_branch, maybe_false_branch)| { |
| 194 | + if let Some(false_branch) = maybe_false_branch { |
| 195 | + Expr::IfElse(Box::new(pred), Box::new(true_branch), Box::new(false_branch)) |
| 196 | + } else { |
| 197 | + Expr::If(Box::new(pred), Box::new(true_branch)) |
| 198 | + } |
| 199 | + }, |
| 200 | + ), |
| 201 | + ); |
| 202 | + s_exp(if_inner)(i) |
| 203 | +} |
| 204 | + |
| 205 | +/// A quoted S-expression is list data structure. |
| 206 | +/// |
| 207 | +/// This example doesn't have the symbol atom, but by adding variables and changing |
| 208 | +/// the definition of quote to not always be around an S-expression, we'd get them |
| 209 | +/// naturally. |
| 210 | +fn parse_quote<'a>(i: &'a str) -> IResult<&'a str, Expr, VerboseError<&'a str>> { |
| 211 | + // this should look very straight-forward after all we've done: |
| 212 | + // we find the `'` (quote) character, use cut to say that we're unambiguously |
| 213 | + // looking for an s-expression of 0 or more expressions, and then parse them |
| 214 | + map(context("quote", preceded(tag("'"), cut(s_exp(many0(parse_expr))))), |exprs| { |
| 215 | + Expr::Quote(exprs) |
| 216 | + })(i) |
| 217 | +} |
| 218 | + |
| 219 | +/// We tie them all together again, making a top-level expression parser! |
| 220 | +
|
| 221 | +fn parse_expr<'a>(i: &'a str) -> IResult<&'a str, Expr, VerboseError<&'a str>> { |
| 222 | + preceded(multispace0, alt((parse_constant, parse_application, parse_if, parse_quote)))(i) |
| 223 | +} |
| 224 | + |
| 225 | +/// And that's it! |
| 226 | +/// We can now parse our entire lisp language. |
| 227 | +/// |
| 228 | +/// But in order to make it a little more interesting, we can hack together |
| 229 | +/// a little interpreter to take an Expr, which is really an |
| 230 | +/// [Abstract Syntax Tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree) (AST), |
| 231 | +/// and give us something back |
| 232 | +
|
| 233 | +/// To start we define a couple of helper functions |
| 234 | +fn get_num_from_expr(e: Expr) -> Option<i32> { |
| 235 | + if let Expr::Constant(Atom::Num(n)) = e { |
| 236 | + Some(n) |
| 237 | + } else { |
| 238 | + None |
| 239 | + } |
| 240 | +} |
| 241 | + |
| 242 | +fn get_bool_from_expr(e: Expr) -> Option<bool> { |
| 243 | + if let Expr::Constant(Atom::Boolean(b)) = e { |
| 244 | + Some(b) |
| 245 | + } else { |
| 246 | + None |
| 247 | + } |
| 248 | +} |
| 249 | + |
| 250 | +/// This function tries to reduce the AST. |
| 251 | +/// This has to return an Expression rather than an Atom because quoted s_expressions |
| 252 | +/// can't be reduced |
| 253 | +fn eval_expression(e: Expr) -> Option<Expr> { |
| 254 | + match e { |
| 255 | + // Constants and quoted s-expressions are our base-case |
| 256 | + Expr::Constant(_) | Expr::Quote(_) => Some(e), |
| 257 | + // we then recursively `eval_expression` in the context of our special forms |
| 258 | + // and built-in operators |
| 259 | + Expr::If(pred, true_branch) => { |
| 260 | + let reduce_pred = eval_expression(*pred)?; |
| 261 | + if get_bool_from_expr(reduce_pred)? { |
| 262 | + eval_expression(*true_branch) |
| 263 | + } else { |
| 264 | + None |
| 265 | + } |
| 266 | + } |
| 267 | + Expr::IfElse(pred, true_branch, false_branch) => { |
| 268 | + let reduce_pred = eval_expression(*pred)?; |
| 269 | + if get_bool_from_expr(reduce_pred)? { |
| 270 | + eval_expression(*true_branch) |
| 271 | + } else { |
| 272 | + eval_expression(*false_branch) |
| 273 | + } |
| 274 | + } |
| 275 | + Expr::Application(head, tail) => { |
| 276 | + let reduced_head = eval_expression(*head)?; |
| 277 | + let reduced_tail = tail.into_iter().map(|expr| eval_expression(expr)).collect::<Option<Vec<Expr>>>()?; |
| 278 | + if let Expr::Constant(Atom::BuiltIn(bi)) = reduced_head { |
| 279 | + Some(Expr::Constant(match bi { |
| 280 | + BuiltIn::Plus => Atom::Num( |
| 281 | + reduced_tail |
| 282 | + .into_iter() |
| 283 | + .map(get_num_from_expr) |
| 284 | + .collect::<Option<Vec<i32>>>()? |
| 285 | + .into_iter() |
| 286 | + .sum(), |
| 287 | + ), |
| 288 | + BuiltIn::Times => Atom::Num( |
| 289 | + reduced_tail |
| 290 | + .into_iter() |
| 291 | + .map(get_num_from_expr) |
| 292 | + .collect::<Option<Vec<i32>>>()? |
| 293 | + .into_iter() |
| 294 | + .product(), |
| 295 | + ), |
| 296 | + BuiltIn::Equal => Atom::Boolean(reduced_tail.iter().zip(reduced_tail.iter().skip(1)).all(|(a, b)| a == b)), |
| 297 | + BuiltIn::Not => { |
| 298 | + if reduced_tail.len() != 1 { |
| 299 | + return None; |
| 300 | + } else { |
| 301 | + Atom::Boolean(!get_bool_from_expr(reduced_tail.first().cloned().unwrap())?) |
| 302 | + } |
| 303 | + } |
| 304 | + BuiltIn::Minus => Atom::Num(if let Some(first_elem) = reduced_tail.first().cloned() { |
| 305 | + let fe = get_num_from_expr(first_elem)?; |
| 306 | + reduced_tail |
| 307 | + .into_iter() |
| 308 | + .map(get_num_from_expr) |
| 309 | + .collect::<Option<Vec<i32>>>()? |
| 310 | + .into_iter() |
| 311 | + .skip(1) |
| 312 | + .fold(fe, |a, b| a - b) |
| 313 | + } else { |
| 314 | + Default::default() |
| 315 | + }), |
| 316 | + BuiltIn::Divide => Atom::Num(if let Some(first_elem) = reduced_tail.first().cloned() { |
| 317 | + let fe = get_num_from_expr(first_elem)?; |
| 318 | + reduced_tail |
| 319 | + .into_iter() |
| 320 | + .map(get_num_from_expr) |
| 321 | + .collect::<Option<Vec<i32>>>()? |
| 322 | + .into_iter() |
| 323 | + .skip(1) |
| 324 | + .fold(fe, |a, b| a / b) |
| 325 | + } else { |
| 326 | + Default::default() |
| 327 | + }), |
| 328 | + })) |
| 329 | + } else { |
| 330 | + None |
| 331 | + } |
| 332 | + } |
| 333 | + } |
| 334 | +} |
| 335 | + |
| 336 | +/// And we add one more top-level function to tie everything together, letting |
| 337 | +/// us call eval on a string directly |
| 338 | +fn eval_from_str(src: &str) -> Result<Expr, String> { |
| 339 | + parse_expr(src) |
| 340 | + .map_err(|e: nom::Err<VerboseError<&str>>| format!("{:#?}", e)) |
| 341 | + .and_then(|(_, exp)| eval_expression(exp).ok_or("Eval failed".to_string())) |
| 342 | +} |
| 343 | + |
| 344 | +fn main() { |
| 345 | + let expression_1 = "((if (= (+ 3 (/ 9 3)) |
| 346 | + (* 2 3)) |
| 347 | + * |
| 348 | + /) |
| 349 | + 456 123)"; |
| 350 | + println!("\"{}\"\nevaled gives us: {:?}", expression_1, eval_from_str(expression_1)); |
| 351 | +} |
| 352 | + |
0 commit comments