So lately I have been working on writing a simple compiler to better understand compiler concepts. As a diligent reader of Stack Overflow, I have noticed what seems to be a consensus, along the following lines:
One strategy for functional parsing is monadic parser combinators. You can read some about it here (and follow links) or use a library like FParsec. I do not recommend this approach if you're just learning/starting F#/compilers, though.
Another approach with F# is to use FsLex/FsYacc (in the PowerPack). I kinda loathe Lex/Yacc technology, so I also don't recommend this.
I think you should write a recursive-descent parser by hand. I don't have strong feelings regarding a tokenizer, but simply tokenizing the entire file into a(n immutable) list of tokens and then doing recursive descent (and leveraging some pattern matching) is a good way to deal with parsing. And of course, you'll want to use discriminated unions to represent the AST output of the parser (a la here).
I haven't read the dragon book in a long time, but I'm apparently the only person on the planet who doesn't like it. I would consider abandoning that text in favor of a book that discusses compilers using some ML-based language, though I can't recommend one offhand.
EDIT
I haven't done one of these in a while, so I took a few minutes to code a small sample.
// AST for tiny language
// Three discriminated unions describe a parsed program: operators,
// expressions built from integer literals and binary operations, and
// statements.

/// Binary operators that can appear inside a BinaryOp expression.
type Op =
| Plus
| Minus
/// Expression tree. A Literal carries an int; a BinaryOp carries its left
/// operand, the operator and its right operand, in that order.
type Expr =
| Literal of int
| BinaryOp of Expr * Op * Expr // left, op, right
/// Statement tree. IfThenElse carries the condition expression followed by
/// the "then" and "else" statements; per the inline note, a condition value
/// of 0 stands for false. Print carries the expression to print.
type Stmt =
| IfThenElse of Expr * Stmt * Stmt // cond, then, else; 0=false in cond
| Print of Expr
// sample program
// Verbatim (@"...") string, so the line breaks below are part of the value
// and the text begins with a newline. The tokenizer in this file skips
// whitespace characters, so the layout of the program text is free-form.
let input = @"
if 1+1-1 then
print 42
else
print 0"
// Expected AST for the sample program; the parser's result is compared
// against this value at the end of the file.
let goal =
    // The condition 1+1-1 groups to the left: (1 + 1) - 1.
    let onePlusOne = BinaryOp(Literal 1, Plus, Literal 1)
    let condition = BinaryOp(onePlusOne, Minus, Literal 1)
    let thenBranch = Print(Literal 42)
    let elseBranch = Print(Literal 0)
    IfThenElse(condition, thenBranch, elseBranch)
////////////////////////////////////////////////////////////////////////////
// Lexer

/// Tokens produced by the tokenizer. IF, THEN, ELSE and PRINT correspond to
/// the keywords "if", "then", "else" and "print"; PLUS and MINUS to "+" and
/// "-"; NUM carries the value of a run of digits; EOF is returned once the
/// input has been consumed.
type Token =
| IF
| THEN
| ELSE
| PRINT
| NUM of int // non-negative
| PLUS
| MINUS
| EOF
/// Builds a tokenizer over the string `s`.
///
/// Returns a function of unit; every call consumes and returns the next
/// Token of `s`. Whitespace between tokens is skipped. Once the end of `s`
/// has been reached, every further call returns EOF.
///
/// Raises a Failure exception ("unexpected char ...") when the text at the
/// current position is neither whitespace, a digit, nor one of the known
/// keywords/operators. System.Int32.Parse may throw for a digit run that
/// does not fit in an int.
let makeTokenizer (s:string) =
    // Current read position in `s`; a ref cell because it is captured and
    // updated by the closure returned below.
    let i = ref 0
    // Fixed spellings recognised by the lexer, tried in this order.
    let keywords = [
        "if", IF
        "then", THEN
        "else", ELSE
        "print", PRINT
        "+", PLUS
        "-", MINUS ]
    // True when `kw` occurs in `s` exactly at index `pos`. Ordinal comparison
    // of at most kw.Length characters: it inspects only that position instead
    // of searching the rest of the input, and does not depend on the culture.
    let matchesAt pos (kw:string) =
        System.String.CompareOrdinal(s, pos, kw, 0, kw.Length) = 0
    let rec getNextToken() =
        if i.Value >= s.Length then
            EOF
        elif System.Char.IsWhiteSpace(s.[i.Value]) then
            // Skip one whitespace character and retry.
            i.Value <- i.Value + 1
            getNextToken()
        elif System.Char.IsDigit(s.[i.Value]) then
            // Consume the maximal run of digits starting here.
            let start = i.Value
            let mutable j = start
            while j < s.Length && System.Char.IsDigit(s.[j]) do
                j <- j + 1
            let numStr = s.Substring(start, j - start)
            i.Value <- j
            NUM(System.Int32.Parse(numStr)) // may throw, e.g. if > MAXINT
        else
            match keywords |> List.tryFind (fun (kwStr, _) -> matchesAt i.Value kwStr) with
            | Some (kwStr, kwTok) ->
                i.Value <- i.Value + kwStr.Length
                kwTok
            | None ->
                // failwithf (not failwith) is required to format the message.
                failwithf "unexpected char '%c' at position %d" s.[i.Value] i.Value
    getNextToken
// Token list for the sample program: everything the tokenizer returns, up to
// and including the first EOF.
let tokens =
    let nextToken = makeTokenizer input
    // Accumulates tokens in reverse and restores the order once EOF is seen.
    let rec collect acc =
        match nextToken() with
        | EOF -> List.rev (EOF :: acc)
        | tok -> collect (tok :: acc)
    collect []
// Print the token list so the lexer output can be inspected by eye.
printfn "%A" tokens
/////////////////////////////////////////////////////////////////////////
// Parser

/// Parses an expression from the front of the token list `toks`.
///
/// An expression is a NUM followed by zero or more "PLUS NUM" / "MINUS NUM"
/// pairs; operators associate to the left.
///
/// Returns a pair: the Expr that was parsed and the tokens that follow it.
///
/// Raises a Failure exception ("parse error in expression, expected number")
/// when `toks` does not start with NUM, or when a PLUS/MINUS is not followed
/// by NUM.
let parseExpr toks =
    // Folds each "<operator> NUM" pair into the tree built so far, so that
    // a-b+c becomes (a-b)+c. Stops at the first token that is not PLUS/MINUS;
    // an exhausted token list also just ends the expression (pattern matching
    // instead of List.Head, which would throw on an empty list).
    let rec loop expr rest =
        match rest with
        | PLUS :: NUM y :: tail -> loop (BinaryOp(expr, Plus, Literal y)) tail
        | MINUS :: NUM y :: tail -> loop (BinaryOp(expr, Minus, Literal y)) tail
        | PLUS :: _
        | MINUS :: _ ->
            failwith "parse error in expression, expected number"
        | _ -> expr, rest
    match toks with
    | NUM x :: rest -> loop (Literal x) rest
    | _ -> failwith "parse error in expression, expected number"
/// Parses one statement from the front of the token list `toks`.
///
/// Accepted forms:
///   PRINT expr
///   IF expr THEN stmt ELSE stmt
///
/// Returns a pair: the Stmt that was parsed and the tokens that follow it.
///
/// Raises a Failure exception when `toks` does not start with PRINT or IF,
/// when THEN or ELSE is missing where required, or when parseExpr fails.
let rec parseStmt toks =
    match toks with
    | PRINT :: rest ->
        let e, rest = parseExpr rest
        Print e, rest
    | IF :: rest ->
        let cond, rest = parseExpr rest
        // The condition must be followed by 'then'.
        let rest =
            match rest with
            | THEN :: rest -> rest
            | _ -> failwith "parse error after if expression, expected 'then'"
        let thenStmt, rest = parseStmt rest
        // The 'then' branch must be followed by 'else'; there is no
        // else-less form of the statement.
        let rest =
            match rest with
            | ELSE :: rest -> rest
            | _ -> failwith "parse error after if branch, expected 'else'"
        let elseStmt, rest = parseStmt rest
        IfThenElse(cond, thenStmt, elseStmt), rest
    | _ -> failwith "parse error, expected statement"
/// Parses a whole program: exactly one statement followed by EOF and nothing
/// else. Returns the statement; fails with a parse error when anything other
/// than a single EOF token remains after it.
let parseProgram toks =
    match parseStmt toks with
    | stmt, [EOF] -> stmt
    | _ -> failwith "parse error after statement, expected EOF"
// Parse the sample token list, print the resulting AST, and compare it with
// the hand-written expected tree.
let p = tokens |> parseProgram
p |> printfn "%A"
assert (p = goal)
(Hopefully there are no egregious bugs.)