sq/libsq/core/diffdoc/internal/go-udiff/ndiff.go
Neil O'Toole a3cd01f36a
#353 Diff performance (#399)
* Diff refactor
2024-02-20 16:26:45 -07:00

110 lines
2.7 KiB
Go

// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package udiff
import (
"bytes"
"unicode/utf8"
"github.com/neilotoole/sq/libsq/core/diffdoc/internal/go-udiff/lcs"
)
// Strings computes the differences between two strings.
// The resulting edits respect rune boundaries.
func Strings(before, after string) []Edit {
if before == after {
return nil // common case
}
if stringIsASCII(before) && stringIsASCII(after) {
// TODO(adonovan): opt: specialize diffASCII for strings.
return diffASCII([]byte(before), []byte(after))
}
return diffRunes([]rune(before), []rune(after))
}
// Bytes computes the differences between two byte slices.
// The resulting edits respect rune boundaries.
func Bytes(before, after []byte) []Edit {
if bytes.Equal(before, after) {
return nil // common case
}
if bytesIsASCII(before) && bytesIsASCII(after) {
return diffASCII(before, after)
}
return diffRunes(runes(before), runes(after))
}
func diffASCII(before, after []byte) []Edit {
diffs := lcs.DiffBytes(before, after)
// Convert from LCS diffs.
res := make([]Edit, len(diffs))
for i, d := range diffs {
res[i] = Edit{Start: d.Start, End: d.End, New: string(after[d.ReplStart:d.ReplEnd])}
}
return res
}
func diffRunes(before, after []rune) []Edit {
diffs := lcs.DiffRunes(before, after)
// The diffs returned by the lcs package use indexes
// into whatever slice was passed in.
// Convert rune offsets to byte offsets.
res := make([]Edit, len(diffs))
lastEnd := 0
utf8Len := 0
for i, d := range diffs {
utf8Len += runesLen(before[lastEnd:d.Start]) // text between edits
start := utf8Len
utf8Len += runesLen(before[d.Start:d.End]) // text deleted by this edit
res[i] = Edit{Start: start, End: utf8Len, New: string(after[d.ReplStart:d.ReplEnd])}
lastEnd = d.End
}
return res
}
// runes is like []rune(string(bytes)) without the duplicate allocation.
func runes(b []byte) []rune {
n := utf8.RuneCount(b)
rs := make([]rune, n)
for i := 0; i < n; i++ {
r, sz := utf8.DecodeRune(b)
b = b[sz:]
rs[i] = r
}
return rs
}
// runesLen returns the length in bytes of the UTF-8 encoding of runes.
func runesLen(runes []rune) (length int) {
for _, r := range runes {
length += utf8.RuneLen(r)
}
return length
}
// stringIsASCII reports whether s contains only ASCII.
// TODO(adonovan): combine when x/tools allows generics.
func stringIsASCII(s string) bool {
for i := 0; i < len(s); i++ {
if s[i] >= utf8.RuneSelf {
return false
}
}
return true
}
func bytesIsASCII(s []byte) bool {
for i := 0; i < len(s); i++ {
if s[i] >= utf8.RuneSelf {
return false
}
}
return true
}