Unmarshal an ISO-8859-1 XML input in Go

后端 未结 6 827
时光说笑
时光说笑 2020-12-09 15:48

When your XML input isn't encoded in UTF-8, the Unmarshal function of the xml package seems to require a CharsetReader.

Where do you find a CharsetReader?

相关标签:
6条回答
  • 2020-12-09 16:15

    There appears to be an external library which handles this: go-charset. I haven't tried it myself; does it work for you?

    0 讨论(0)
  • 2020-12-09 16:17

    Updated answer for 2015 & beyond:

    import (
        "encoding/xml"
        "golang.org/x/net/html/charset"
    )
    // NOTE: bytes.NewReader is used here, so "bytes" must be imported as well;
    // it is not in the import list shown with this snippet.
    reader := bytes.NewReader(theXml)
    decoder := xml.NewDecoder(reader)
    // Install the charset hook before decoding so that a non-UTF-8 encoding
    // declaration in the input can be handled.
    decoder.CharsetReader = charset.NewReaderLabel
    err = decoder.Decode(&parsed)
    
    0 讨论(0)
  • 2020-12-09 16:20

    Expanding on @anschel-schaffer-cohen's suggestion and @mjibson's comment, using the go-charset package as mentioned above allows you to use these three lines

    decoder := xml.NewDecoder(reader)
    decoder.CharsetReader = charset.NewReader
    err = decoder.Decode(&parsed)
    

    to achieve the required result. Just remember to let charset know where its data files are by calling

    charset.CharsetDir = ".../src/code.google.com/p/go-charset/datafiles"
    

    at some point when the app starts up.

    EDIT

    Instead of setting charset.CharsetDir as shown above, it's more sensible to just import the data files; they are treated as an embedded resource:

    import (
        "code.google.com/p/go-charset/charset"
        _ "code.google.com/p/go-charset/data"
        ...
    )
    

    go install will just do its thing; this also avoids the deployment headache (where/how do I get the data files relative to the executing app?).

    Using import with an underscore just calls the package's init() func, which loads the required stuff into memory.

    0 讨论(0)
  • 2020-12-09 16:26

    Here's a sample Go program which uses a CharsetReader function to convert XML input from ISO-8859-1 to UTF-8. The program prints the test file XML comments.

    package main
    
    import (
        "bytes"
        "fmt"
        "io"
        "os"
        "strings"
        "utf8"
        "xml"
    )
    
    // CharsetISO88591er reads ISO-8859-1 bytes from r and hands them out as
    // UTF-8, one byte at a time, through its ReadByte method.
    type CharsetISO88591er struct {
        // r is the source of ISO-8859-1 encoded bytes.
        r   io.ByteReader
        // buf holds the UTF-8 encoding of a non-ASCII character until all of
        // its bytes have been returned by ReadByte.
        buf *bytes.Buffer
    }
    
    // NewCharsetISO88591 returns a CharsetISO88591er that takes its
    // ISO-8859-1 input from r. The reader must also implement io.ByteReader;
    // the unchecked type assertion panics otherwise. The internal buffer is
    // created empty with room for one UTF-8 encoded rune.
    func NewCharsetISO88591(r io.Reader) *CharsetISO88591er {
        scratch := make([]byte, 0, utf8.UTFMax)
        return &CharsetISO88591er{
            r:   r.(io.ByteReader),
            buf: bytes.NewBuffer(scratch),
        }
    }
    
    // ReadByte returns the next byte of the UTF-8 translation of the
    // ISO-8859-1 input.
    //
    // Mapping reference:
    // http://unicode.org/Public/MAPPINGS/ISO8859/8859-1.TXT
    // Date: 1999 July 27; Last modified: 27-Feb-2001 05:08
    func (cs *CharsetISO88591er) ReadByte() (b byte, err os.Error) {
        // Bytes left over from a previously encoded character go out first.
        if cs.buf.Len() > 0 {
            return cs.buf.ReadByte()
        }
        c, err := cs.r.ReadByte()
        if err != nil {
            return 0, err
        }
        // Values below utf8.RuneSelf are encoded identically in UTF-8.
        if c < utf8.RuneSelf {
            return c, nil
        }
        // Any other value becomes a multi-byte UTF-8 sequence: encode it into
        // the buffer and start handing it out byte by byte.
        cs.buf.WriteRune(int(c))
        return cs.buf.ReadByte()
    }
    
    // Read gives the type the Read method of io.Reader so that it can be
    // returned from CharsetReader, but it never transfers any data: every call
    // reports os.EINVAL. The translated stream has to be consumed through
    // ReadByte instead.
    func (cs *CharsetISO88591er) Read(p []byte) (int, os.Error) {
        // Use ReadByte method.
        return 0, os.EINVAL
    }
    
    // isCharset reports whether charset equals one of names when both sides
    // are compared in lower case.
    func isCharset(charset string, names []string) bool {
        want := strings.ToLower(charset)
        for i := range names {
            if strings.ToLower(names[i]) == want {
                return true
            }
        }
        return false
    }
    
    // IsCharsetISO88591 reports whether charset is one of the names listed
    // for ISO-8859-1, ignoring case.
    //
    // The names are taken from http://www.iana.org/assignments/character-sets
    // (last updated 2010-11-04).
    func IsCharsetISO88591(charset string) bool {
        aliases := [...]string{
            "ISO_8859-1:1987", // name
            "ISO-8859-1",      // alias (preferred MIME name)
            // remaining aliases
            "iso-ir-100",
            "ISO_8859-1",
            "latin1",
            "l1",
            "IBM819",
            "CP819",
            "csISOLatin1",
        }
        return isCharset(charset, aliases[:])
    }
    
    // IsCharsetUTF8 reports whether charset names UTF-8, ignoring case. The
    // empty string is accepted too; the name list treats it as the default.
    func IsCharsetUTF8(charset string) bool {
        return isCharset(charset, []string{"UTF-8", ""})
    }
    
    // CharsetReader picks a reader for the named charset. UTF-8 input (or an
    // empty charset name) is passed through unchanged, ISO-8859-1 input is
    // wrapped in a CharsetISO88591er, and any other charset yields an error.
    func CharsetReader(charset string, input io.Reader) (io.Reader, os.Error) {
        if IsCharsetUTF8(charset) {
            return input, nil
        }
        if IsCharsetISO88591(charset) {
            return NewCharsetISO88591(input), nil
        }
        return nil, os.NewError("CharsetReader: unexpected charset: " + charset)
    }
    
    // main opens ISO88591.xml, parses it with CharsetReader installed as the
    // parser's charset hook, and prints every processing instruction and
    // comment it encounters.
    func main() {
        // Print the XML comments from the test file, which should
        // contain most of the printable ISO-8859-1 characters.
        r, err := os.Open("ISO88591.xml")
        if err != nil {
            fmt.Println(err)
            return
        }
        defer r.Close()
        fmt.Println("file:", r.Name())
        p := xml.NewParser(r)
        p.CharsetReader = CharsetReader
        // The loop ends at the first nil token or non-nil error; the error
        // value itself is not reported.
        for t, err := p.Token(); t != nil && err == nil; t, err = p.Token() {
            switch t := t.(type) {
            case xml.ProcInst:
                // Processing instruction: print its target and its contents.
                fmt.Println(t.Target, string(t.Inst))
            case xml.Comment:
                fmt.Println(string([]byte(t)))
            }
        }
    }
    

    To unmarshal XML with encoding="ISO-8859-1" from an io.Reader r into a structure result, while using the CharsetReader function from the program to translate from ISO-8859-1 to UTF-8, write:

    p := xml.NewParser(r)
    p.CharsetReader = CharsetReader
    err := p.Unmarshal(&result, nil)
    
    0 讨论(0)
  • 2020-12-09 16:27

    Edit: do not use this, use the go-charset answer.

    Here's an updated version of @peterSO's code that works with go1:

    package main
    
    import (
        "bytes"
        "io"
        "strings"
    )
    
    // CharsetISO88591er is an io.Reader that converts an ISO-8859-1 byte
    // stream to UTF-8. Every input byte is treated as the Unicode code point
    // with the same value and re-encoded as UTF-8.
    type CharsetISO88591er struct {
        r   io.ByteReader // source of ISO-8859-1 bytes
        buf *bytes.Buffer // converted UTF-8 bytes not yet handed to the caller
        err error         // first error returned by r; sticky once set
    }

    // NewCharsetISO88591 returns a reader that yields the UTF-8 translation of
    // the ISO-8859-1 data read from r. If r does not implement io.ByteReader
    // it is read one byte at a time through a small adapter instead of
    // panicking on a failed type assertion.
    func NewCharsetISO88591(r io.Reader) *CharsetISO88591er {
        br, ok := r.(io.ByteReader)
        if !ok {
            br = singleByteReader{r}
        }
        return &CharsetISO88591er{r: br, buf: new(bytes.Buffer)}
    }

    // singleByteReader adapts a plain io.Reader to io.ByteReader by issuing
    // one-byte reads, so it never consumes more of the source than requested.
    type singleByteReader struct {
        r io.Reader
    }

    // ReadByte reads exactly one byte from the wrapped reader.
    func (s singleByteReader) ReadByte() (byte, error) {
        var b [1]byte
        _, err := io.ReadFull(s.r, b[:])
        return b[0], err
    }

    // Read fills p with converted UTF-8 data and returns the number of bytes
    // written. Converted data is always delivered before an error: the first
    // error reported by the source (io.EOF included) is remembered and
    // returned once the buffered data has been drained, and on every call
    // after that.
    func (cs *CharsetISO88591er) Read(p []byte) (n int, err error) {
        if len(p) == 0 {
            return 0, nil
        }
        // Pull from the source only until p can be filled. One input byte
        // becomes at most two UTF-8 bytes, so the buffer stays bounded by
        // len(p)+1 instead of growing with every call.
        for cs.err == nil && cs.buf.Len() < len(p) {
            b, rerr := cs.r.ReadByte()
            if rerr != nil {
                cs.err = rerr
                break
            }
            cs.buf.WriteRune(rune(b))
        }
        if cs.buf.Len() == 0 {
            // Nothing left to deliver: surface the source's own error rather
            // than letting the empty buffer report io.EOF in its place.
            return 0, cs.err
        }
        return cs.buf.Read(p)
    }
    
    // isCharset reports whether the lower-cased charset matches the
    // lower-cased form of any entry in names.
    func isCharset(charset string, names []string) bool {
        target := strings.ToLower(charset)
        matched := false
        for idx := 0; idx < len(names) && !matched; idx++ {
            matched = strings.ToLower(names[idx]) == target
        }
        return matched
    }
    
    // IsCharsetISO88591 reports whether charset is one of the names listed
    // for ISO-8859-1; the comparison ignores case.
    //
    // Name list source: http://www.iana.org/assignments/character-sets
    // (last updated 2010-11-04).
    func IsCharsetISO88591(charset string) bool {
        return isCharset(charset, []string{
            // Name
            "ISO_8859-1:1987",
            // Alias (preferred MIME name)
            "ISO-8859-1",
            // Aliases
            "iso-ir-100",
            "ISO_8859-1",
            "latin1",
            "l1",
            "IBM819",
            "CP819",
            "csISOLatin1",
        })
    }
    
    // CharsetReader returns a reader for the named charset. ISO-8859-1 input
    // is wrapped in a CharsetISO88591er; input in any other charset is
    // returned unchanged. The returned error is always nil.
    func CharsetReader(charset string, input io.Reader) (io.Reader, error) {
        if !IsCharsetISO88591(charset) {
            return input, nil
        }
        return NewCharsetISO88591(input), nil
    }
    

    Called with:

    d := xml.NewDecoder(reader)
    d.CharsetReader = CharsetReader
    err := d.Decode(&dst)
    
    0 讨论(0)
  • 2020-12-09 16:28

    There aren't any provided in the go distribution at the moment, or anywhere else I can find. Not surprising as that hook is less than a month old at the time of writing.

    Since a CharsetReader is defined as CharsetReader func(charset string, input io.Reader) (io.Reader, os.Error), you could make your own. There's one example in the tests, but that might not be exactly useful to you.

    0 讨论(0)
提交回复
热议问题