F# How to tokenise user input: separating numbers, units, words?

。_饼干妹妹 提交于 2019-12-04 12:38:04

I put together an example using the FParsec library. The example is not robust at all but it gives a pretty good picture of how to use FParsec.

/// A single token recognised in the user's input.
/// Every case carries the exact text that was matched.
type Element =
    /// A run of letters and/or ':' characters.
    | Word of string
    /// Digits, optionally followed by '.' and further digits.
    | Number of string
    /// One of the operator strings ("at" or "/").
    | Operator of string
    /// A known currency code (currently only "EUR").
    | CurrencyCode of string
    /// The literal "%" sign.
    | PerCent of string

/// Matches a literal "%" and wraps the matched text as a PerCent element.
let parsePerCent state =
    (pstring "%" |>> PerCent) state

/// Parsers for the recognised currency codes.
/// A code must not be immediately followed by a letter, so that a word which
/// merely starts with a code (e.g. "EUROPE") is not split into a currency
/// code plus a leftover word. `attempt` rewinds the consumed prefix when the
/// look-ahead fails, so later alternatives still see the whole input.
let currencyCodes = [|
    attempt (pstring "EUR" .>> notFollowedBy letter)
|]

/// Tries each parser in `currencyCodes` in turn and wraps the first match
/// as a CurrencyCode element.
let parseCurrencyCode state =
    (choice currencyCodes |>> CurrencyCode) state

/// Parsers for the recognised operator strings.
/// "at" is only an operator when it is not the start of a longer word:
/// without the look-ahead, input such as "attic" would be split into the
/// operator "at" and the word "tic". `attempt` rewinds the consumed "at" when
/// the look-ahead fails, so the word parser can still match the whole word.
let operators = [|
    attempt (pstring "at" .>> notFollowedBy letter)
    pstring "/"
|]

/// Tries each parser in `operators` in turn and wraps the first match as an
/// Operator element.
let parseOperator state =
    (choice operators |>> Operator) state

/// Matches one or more digits, an optional '.', and then zero or more digits,
/// returning the matched text as a Number element.
/// Note that a trailing '.' with no digits after it is kept, so "6." yields
/// Number "6.".
let parseNumber state =
    let build whole dot frac =
        let separator =
            match dot with
            | Some _ -> "."
            | None -> ""
        Number (whole + separator + frac)
    pipe3 (many1Chars digit) (opt (pchar '.')) (manyChars digit) build state

/// Matches one or more characters that are each a letter or ':' and wraps
/// the matched text as a Word element.
let parseWord state =
    let wordChar = letter <|> pchar ':'
    (many1Chars wordChar |>> Word) state

/// All element parsers, in the order they are tried.
/// Operators and currency codes come before words because their text is made
/// of letters and would otherwise be swallowed by the word parser.
let elements =
    [| parseOperator
       parseCurrencyCode
       parseWord
       parseNumber
       parsePerCent |]

/// Parses a single element, skipping any whitespace before and after it.
let parseElement state =
    (spaces >>. choice elements .>> spaces) state

/// Parses elements repeatedly until the end of the input and returns them as
/// a list.
/// Leading whitespace is skipped up front so that blank (whitespace-only)
/// input yields an empty list instead of a parse error: otherwise the element
/// parser would consume the spaces and then fail to find any element.
let parseElements state =
    (spaces >>. manyTill parseElement eof) state

/// Tokenises `input` and returns the parsed elements.
/// When the input cannot be tokenised, raises an exception (via `failwith`)
/// carrying the parser's error message.
let parse (input:string) =
    match run parseElements input with
    | Success (tokens, _, _) -> tokens
    | Failure (message, _, _) -> failwith message

It sounds like what you really want is just a lexer. A good alternative to FParsec would be FsLex. (A good intro tutorial, albeit somewhat dated, can be found on my old blog here.) Using FsLex you can take your input text:

XYZ Hotel: 6 nights at 220EUR / night plus 17.5% tax

And get it properly tokenized into something like:

 [ Word("XYZ"); Hotel; Int(6); Word("nights"); Word("at"); Int(220); EUR; ... ]

The next step, once you have a list of tokens, is to do some form of pattern matching / analysis to extract semantic information (which I assume is what you are really after). With the normalized token stream, it should be as simple as:

/// Walks the token list, reacting to the token patterns of interest and
/// skipping any token that does not start a known pattern.
/// The actions below are placeholders; replace them with real handling.
let rec processTokenList tokens = 
    match tokens with
    | [] ->
        // End of the token stream: nothing left to examine.
        ()
    | Float(x) :: Keyword("EUR") :: rest ->
        // Euro amount x
        printfn "Amount in EUR: %A" x
        processTokenList rest
    | Word(x) :: Keyword("Hotel") :: rest ->
        // Hotel x
        printfn "Hotel: %A" x
        processTokenList rest
    | _ :: rest ->
        // Couldn't find anything interesting...
        processTokenList rest

That should at least get you started. But note that the more 'formal' your input becomes, the more useful lexing will be. (And if you only accept a very specific input format, then you can use a proper parser and be done with it!)

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!