1
1
mirror of https://github.com/wader/fq.git synced 2024-12-23 13:22:58 +03:00
fq/internal/ioextra/runereader.go

90 lines
1.7 KiB
Go

package ioextra
import (
"io"
"math/bits"
"unicode/utf8"
)
// ByteRuneReader reads each byte as a rune from a io.ReadSeeker
// ex: when used with regexp \u00ff code point will match byte 0xff and not the
// utf-8 encoded version of 0xff
type ByteRuneReader struct {
RS io.ReadSeeker
}
func (brr ByteRuneReader) ReadRune() (r rune, size int, err error) {
var b [1]byte
_, err = io.ReadFull(brr.RS, b[:])
if err != nil {
return 0, 0, err
}
r = rune(b[0])
return r, 1, nil
}
func (brr ByteRuneReader) Seek(offset int64, whence int) (int64, error) {
return brr.RS.Seek(offset, whence)
}
type RuneReadSeeker struct {
RS io.ReadSeeker
}
func utf8Bytes(b byte) int {
c := bits.LeadingZeros8(^b)
// 0b0xxxxxxx 1 byte
// 0b110xxxxx 2 byte
// 0b1110xxxx 3 byte
// 0b11110xxx 4 byte
switch c {
case 0:
return 1
case 2, 3, 4:
return c
default:
return -1
}
}
// RuneReadSeeker reads rune from a io.ReadSeeker
func (brr RuneReadSeeker) ReadRune() (r rune, size int, err error) {
var b [utf8.UTFMax]byte
_, err = io.ReadFull(brr.RS, b[0:1])
if err != nil {
return 0, 0, err
}
c := b[0]
if c < utf8.RuneSelf {
return rune(c), 1, nil
}
ss := utf8Bytes(b[0])
if ss < 0 {
return utf8.RuneError, 1, nil
}
_, err = io.ReadFull(brr.RS, b[1:ss])
if err != nil {
return 0, 0, err
}
r, s := utf8.DecodeRune(b[0:ss])
// possibly rewind if DecodeRune fails as there was a invalid multi byte code point
// TODO: better way that don't require seek back? buffer? one at a time?
d := ss - s
if d > 0 {
if _, err := brr.Seek(int64(-d), io.SeekCurrent); err != nil {
return 0, 0, err
}
}
return r, s, nil
}
func (brr RuneReadSeeker) Seek(offset int64, whence int) (int64, error) {
return brr.RS.Seek(offset, whence)
}