type token = Plus | Dash | Star | Slash | LParen | RParen | Int of string
(* string -> token list
"+ 1 2" -> [Plus; Int("1"); Int("2")]
*)
let lexer program =
let mkcompile s = Re.compile (Re.Perl.re s) in
let plus_re = mkcompile "^\\+" in
let mult_re = mkcompile "^\\*" in
let sub_re = mkcompile "^-" in
let div_re = mkcompile "^\\/" in
let lp_re = mkcompile "^\\(" in
let rp_re = mkcompile "^\\)" in
let num_re = mkcompile "^-?\\d+" in
let ws_re = mkcompile " " in
let rec lexer program =
if Re.execp plus_re program then
Plus::(lexer (String.sub program 1 ((String.len program) - 1)))
else if Re.execp mult_re program then
Star::(lexer (String.sub program 1 ((String.len program) - 1)))
else if Re.execp num_re program then
let num = Re.Group.get (Re.exec num_re program) 0 in
let num_len = String.len num in
Int(num)::(lexer (String.sub program num_len ((String.len program) - num_len)))
else if Re.execp sub_re program then
Dash::(lexer (String.sub program 1 ((String.len program) - 1)))
else if Re.execp div_re program then
Slash::(lexer (String.sub program 1 ((String.len program) - 1)))
else if Re.execp lp_re program then
LParen::(lexer (String.sub program 1 ((String.len program) - 1)))
else if Re.execp rp_re program then
RParen::(lexer (String.sub program 1 ((String.len program) - 1)))
else if Re.exccp ws_re program then
lexer (String.sub program 1 ((String.len program) - 1))
else
failwith "word not valid"
in lexer programBuilding tree
type ast = Value of String | Add of ast * ast | Minus of ast * ast | Mult of ast * ast | Div of ast * ast;;
(* type parse_tree = Value of string | Add of "+" * parse_tree * parse_tree | ... | Paren of "(" * parse_tree * ")" | ...;; *)
(* token list -> ast *)
(* We will make an LL(k) parser, cannot deal with ambiguity, they are recursive descent parsers
LL(k) says left associative, lookahead by k tokens *)
let parser tokens =
(* token list -> ast, token list *)
let parse_e tokens = match tokens with
(* E trees start with Plus *)
|Plus::toks -> (* the first thing after is T *) let ttree, tremain = parse_t toks in
(* then the first thing after the T is an E *)
let etree, eremain = parse_e tremain in
Add(ttree, etree), eremain
(* Or, E trees start with Dash *)
|Dash::toks -> let ttree, tremain = parse_t toks in
let etree, eremain = parse_e tremain in
Minus(ttree, etree), eremain
(* if it's none of these things, it HAS to be a T-tree by definition of our E tree production rule *)
|_ -> parse_t tokens (*ttree, tremain*)
and
parse_t tokens = let ntree, nremain = parse_n tokens in
match nremain with
|Star::toks -> let ttree, tremain = parse_t toks in
Mult(ntree, ttree), tremain
|Slash::toks -> let ttree, tremain = parse_t toks in
Div(ntree, ttree), tremain
(* if it's none of these things, it HAS to be a N-tree by definition of our N tree production rule *)
|_ -> ntree, nremain
and parse_n tokens = match tokens with
|Int(x)::toks -> Value(x), toks
|LParen::toks -> let etree, eremain = parse_e toks in
match eremain with
RParen -> etree, eremain
|_ -> failwith "unbalanced parenthesis"
|_ -> failwith "grammar is wrong"
in
let etree, eremain = parse_e tokens in
match eremain with
[] -> etree
|_ -> failwith "leftover tokens"