// Package utfutil provides methods that make it easy to read data in an UTF-encoding agnostic. package utfutil // These functions autodetect UTF BOM and return UTF-8. If no // BOM is found, a hint is provided as to which encoding to assume. // You can use them as replacements for os.Open() and ioutil.ReadFile() // when the encoding of the file is unknown. // utfutil.OpenFile() is a replacement for os.Open(). // utfutil.ReadFile() is a replacement for ioutil.ReadFile(). // utfutil.NewScanner() takes a filename and returns a Scanner. // utfutil.NewReader() rewraps an existing scanner to make it UTF-encoding agnostic. // utfutil.BytesReader() takes a []byte and decodes it to UTF-8. // Since it is impossible to guess 100% correctly if there is no BOM, // the functions take a 2nd parameter of type "EncodingHint" where you // specify the default encoding for BOM-less data. // If someone writes a golang equivalent of uchatdet, I'll add // a hint called "AUTO" which uses it. // Inspiration: I wrote this after spending half a day trying // to figure out how to use unicode.BOMOverride. // Hopefully this will save other golang newbies from the same. // (golang.org/x/text/encoding/unicode) import ( "bufio" "bytes" "io" "io/ioutil" "os" "golang.org/x/text/encoding" "golang.org/x/text/encoding/unicode" "golang.org/x/text/transform" ) // EncodingHint indicates the file's encoding if there is no BOM. type EncodingHint int const ( UTF8 EncodingHint = iota // UTF-8 UTF16LE // UTF 16 Little Endian UTF16BE // UTF 16 Big Endian WINDOWS = UTF16LE // File came from a MS-Windows system POSIX = UTF8 // File came from Unix or Unix-like systems HTML5 = UTF8 // File came from the web ) // About utfutil.HTML5: // This technique is recommended by the W3C for use in HTML 5: // "For compatibility with deployed content, the byte order // mark (also known as BOM) is considered more authoritative // than anything else." http://www.w3.org/TR/encoding/#specification-hooks // OpenFile is the equivalent of os.Open(). func OpenFile(name string, d EncodingHint) (io.Reader, error) { f, err := os.Open(name) if err != nil { return nil, err } return NewReader(f, d), nil } // ReadFile is the equivalent of ioutil.ReadFile() func ReadFile(name string, d EncodingHint) ([]byte, error) { file, err := OpenFile(name, d) if err != nil { return nil, err } return ioutil.ReadAll(file) } // NewScanner is a convenience function that takes a filename and returns a scanner. func NewScanner(name string, d EncodingHint) (*bufio.Scanner, error) { f, err := OpenFile(name, d) if err != nil { return nil, err } return bufio.NewScanner(f), nil } // NewReader wraps a Reader to decode Unicode to UTF-8 as it reads. func NewReader(r io.Reader, d EncodingHint) io.Reader { var decoder *encoding.Decoder switch d { case UTF8: // Make a transformer that assumes UTF-8 but abides by the BOM. decoder = unicode.UTF8.NewDecoder() case UTF16LE: // Make an tranformer that decodes MS-Windows (16LE) UTF files: winutf := unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM) // Make a transformer that is like winutf, but abides by BOM if found: decoder = winutf.NewDecoder() case UTF16BE: // Make an tranformer that decodes UTF-16BE files: utf16be := unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM) // Make a transformer that is like utf16be, but abides by BOM if found: decoder = utf16be.NewDecoder() } // Make a Reader that uses utf16bom: return transform.NewReader(r, unicode.BOMOverride(decoder)) } // BytesReader is a convenience function that takes a []byte and decodes them to UTF-8. func BytesReader(b []byte, d EncodingHint) io.Reader { return NewReader(bytes.NewReader(b), d) }