sq/libsq/core/ioz/checksum/checksum.go
Neil O'Toole cf0908abfa
JSON ingest performance (#365)
* Implement json ingestion stmt caching
2024-01-25 09:21:56 -07:00

223 lines
6.2 KiB
Go

// Package checksum provides functions for working with checksums.
// It uses crc32 for the checksum algorithm, resulting in checksum
// values like "3af3aaad".
package checksum
import (
"bufio"
"bytes"
"crypto/rand"
"fmt"
"hash/crc32"
"io"
"net/http"
"os"
"strconv"
"strings"
"github.com/neilotoole/sq/libsq/core/errz"
"github.com/neilotoole/sq/libsq/core/ioz"
)
// Sum computes the crc32 (IEEE) checksum of b and returns it
// formatted as lowercase hex, without zero-padding.
// An empty or nil b yields the empty string.
func Sum(b []byte) string {
	if len(b) > 0 {
		return fmt.Sprintf("%x", crc32.Checksum(b, crc32.IEEETable))
	}
	return ""
}
// SumAll computes a single crc32 (IEEE) checksum over a followed by
// each element of b, in order, and returns it as a hex string.
// The elements are fed to the checksum back-to-back with no separator,
// so SumAll("ab", "c") and SumAll("a", "bc") yield the same value.
func SumAll[T ~string](a T, b ...T) string {
	crc := crc32.Update(0, crc32.IEEETable, []byte(a))
	for i := range b {
		crc = crc32.Update(crc, crc32.IEEETable, []byte(b[i]))
	}
	return fmt.Sprintf("%x", crc)
}
// Rand generates a checksum from 128 bytes read from crypto/rand,
// and returns it in the same hex format as Sum.
func Rand() string {
	var seed [128]byte
	// The error from rand.Read is deliberately ignored.
	_, _ = rand.Read(seed[:])
	return Sum(seed[:])
}
// Checksum is a checksum value. It is a plain string type: the
// functions in this package that compute a Checksum (such as ForFile)
// produce the hex string returned by Sum, but Write and Read accept
// and return any string value.
type Checksum string
// Write emits one checksum line to w, terminated by a newline.
// The typical format is:
//
//	<sum> <name>
//	3610a686 file.txt
//
// However, the checksum may be any string value. Use ForFile to
// calculate a checksum, and Read to parse this format.
func Write(w io.Writer, sum Checksum, name string) error {
	if _, err := fmt.Fprintf(w, "%s %s\n", sum, name); err != nil {
		return errz.Err(err)
	}
	return nil
}
// WriteFile stores a single {checksum,name} line in the file at path.
// The file is created if it doesn't exist, and any previous contents
// are truncated.
//
// See: Write.
func WriteFile(path string, sum Checksum, name string) error {
	const flags = os.O_CREATE | os.O_WRONLY | os.O_TRUNC

	f, err := os.OpenFile(path, flags, ioz.RWPerms)
	if err != nil {
		return errz.Wrap(err, "write checksum file")
	}

	if err = Write(f, sum, name); err != nil {
		// Best-effort close: the write error is the one reported.
		_ = f.Close()
		return err
	}

	// On a successful write, a close failure is the function's result.
	return errz.Err(f.Close())
}
// ReadFile opens the checksum file at path and parses it with Read,
// returning the checksums keyed by name.
//
// See Read for details of the format.
func ReadFile(path string) (map[string]Checksum, error) {
	file, openErr := os.Open(path)
	if openErr != nil {
		return nil, errz.Err(openErr)
	}

	// The file is only read, so an error from Close is ignored.
	defer func() { _ = file.Close() }()

	return Read(file)
}
// Read parses checksum lines from r into a map of checksums keyed by
// name. Each line is trimmed of surrounding whitespace; empty lines
// and lines beginning with "#" (comments) are skipped. A line that
// doesn't contain a space separating sum from name is an error.
// This function is the inverse of Write.
func Read(r io.Reader) (map[string]Checksum, error) {
	result := make(map[string]Checksum)

	scanner := bufio.NewScanner(r)
	for scanner.Scan() {
		entry := strings.TrimSpace(scanner.Text())
		if len(entry) == 0 || entry[0] == '#' {
			continue
		}

		// Split on the first space only: the name may itself contain spaces.
		sum, name, found := strings.Cut(entry, " ")
		if !found {
			return nil, errz.Errorf("invalid checksum line: %q", entry)
		}

		result[name] = Checksum(sum)
	}

	return result, errz.Wrap(scanner.Err(), "read checksums")
}
// ForFile computes a checksum for the file at path from its metadata
// as reported by os.Stat: base name, modification time (nanoseconds),
// size, mode, and whether it is a directory. The file's contents are
// not read.
func ForFile(path string) (Checksum, error) {
	info, statErr := os.Stat(path)
	if statErr != nil {
		return "", errz.Wrap(statErr, "calculate file checksum")
	}

	// The fields are concatenated in a fixed order, with no separators.
	data := []byte(info.Name())
	data = strconv.AppendInt(data, info.ModTime().UnixNano(), 10)
	data = strconv.AppendInt(data, info.Size(), 10)
	data = strconv.AppendUint(data, uint64(info.Mode()), 10)
	data = strconv.AppendBool(data, info.IsDir())

	return Checksum(Sum(data)), nil
}
// ForHTTPHeader computes a checksum from URL u and selected values
// of header. When the header carries an Etag, only u and the Etag
// are used. Otherwise, Content-Type, Content-Disposition,
// Content-Length and Last-Modified are used in its place. A nil
// header results in a checksum of u alone.
//
// Deprecated: use ForHTTPResponse instead.
func ForHTTPHeader(u string, header http.Header) Checksum {
	var buf bytes.Buffer
	buf.WriteString(u)

	if header == nil {
		return Checksum(Sum(buf.Bytes()))
	}

	if etag := header.Get("Etag"); etag != "" {
		buf.WriteString(etag)
		return Checksum(Sum(buf.Bytes()))
	}

	// No Etag: fall back to the other identifying headers, in fixed order.
	fallback := [...]string{
		"Content-Type",
		"Content-Disposition",
		"Content-Length",
		"Last-Modified",
	}
	for _, key := range fallback {
		buf.WriteString(header.Get(key))
	}

	return Checksum(Sum(buf.Bytes()))
}
// ForHTTPResponse returns a checksum generated from the response's
// request URL, its ContentLength, and the contents of the response's
// header. If the header contains an Etag, that is used as the primary
// element. Otherwise, other values such as Content-Length and
// Last-Modified are considered. A nil resp yields the empty Checksum.
//
// There's some trickiness with Etag. Note that by default, stdlib http.Client
// will sneakily set the "Accept-Encoding: gzip" header on GET requests.
// However, this doesn't happen for HEAD requests. So, comparing a GET and HEAD
// response for the same URL may result in different checksums, because the
// server will likely return a different Etag for the gzipped response.
//
//	# With gzip
//	Etag: "069dbf690a12d5eb853feb8e04aeb49e-ssl-df"
//
//	# Without gzip
//	Etag: "069dbf690a12d5eb853feb8e04aeb49e-ssl"
//
// Note the "-ssl-df" suffix on the gzipped response. The "df" suffix is
// for "deflate".
//
// The solution here might be to always explicitly set the gzip header on all
// requests. However, when gzip is not explicitly set, the stdlib client
// transparently handles gzip compression, including on the body read end. So,
// ideally, we wouldn't change that part, so that we don't have to code for
// both compressed and uncompressed responses.
//
// Our hack for now is to trim the "-df" suffix from the Etag. Because Etag
// values are normally quoted (as in the examples above), the suffix is
// trimmed both from a bare value ending in -df and from a quoted value
// ending in -df", with the closing quote preserved.
//
// REVISIT: ForHTTPResponse is no longer used. It should be removed.
func ForHTTPResponse(resp *http.Response) Checksum {
	if resp == nil {
		return ""
	}

	buf := bytes.Buffer{}
	if resp.Request != nil && resp.Request.URL != nil {
		buf.WriteString(resp.Request.URL.String() + "\n")
	}

	// ContentLength is an int64. Format it directly rather than narrowing
	// to int, which would wrap on 32-bit platforms for large bodies.
	buf.WriteString(strconv.FormatInt(resp.ContentLength, 10) + "\n")

	header := resp.Header
	if header != nil {
		buf.WriteString(header.Get("Content-Encoding") + "\n")

		etag := strings.TrimSpace(header.Get("Etag"))
		if etag != "" {
			// Normalize the gzip/deflate variant of the Etag so that it
			// matches the uncompressed variant. Handle the quoted form
			// first, since a quoted value never ends in a bare "-df".
			if unquoted := strings.TrimSuffix(etag, `-df"`); unquoted != etag {
				etag = unquoted + `"`
			} else {
				etag = strings.TrimSuffix(etag, "-df")
			}
			buf.WriteString(etag + "\n")
		} else {
			buf.WriteString(header.Get("Content-Type") + "\n")
			buf.WriteString(header.Get("Content-Disposition") + "\n")
			buf.WriteString(header.Get("Content-Length") + "\n")
			buf.WriteString(header.Get("Last-Modified") + "\n")
		}
	}

	return Checksum(Sum(buf.Bytes()))
}