mirror of
https://github.com/neilotoole/sq.git
synced 2024-12-18 13:41:49 +03:00
cf0908abfa
* Implement json ingestion stmt caching
223 lines
6.2 KiB
Go
223 lines
6.2 KiB
Go
// Package checksum provides functions for working with checksums.
|
|
// It uses crc32 for the checksum algorithm, resulting in checksum
|
|
// values like "3af3aaad".
|
|
package checksum
|
|
|
|
import (
|
|
"bufio"
|
|
"bytes"
|
|
"crypto/rand"
|
|
"fmt"
|
|
"hash/crc32"
|
|
"io"
|
|
"net/http"
|
|
"os"
|
|
"strconv"
|
|
"strings"
|
|
|
|
"github.com/neilotoole/sq/libsq/core/errz"
|
|
"github.com/neilotoole/sq/libsq/core/ioz"
|
|
)
|
|
|
|
// Sum computes the CRC-32 (IEEE) checksum of b and returns it as a
// lowercase hexadecimal string without zero-padding. When b is nil
// or has zero length, the empty string is returned instead of the
// checksum of zero bytes.
func Sum(b []byte) string {
	if len(b) > 0 {
		return strconv.FormatUint(uint64(crc32.ChecksumIEEE(b)), 16)
	}
	return ""
}
|
|
|
|
// SumAll streams a, followed by every element of b in order, into a
// single CRC-32 (IEEE) hash and returns the result as a lowercase
// hexadecimal string. The inputs are hashed back-to-back with no
// separator, so SumAll("ab", "c") equals SumAll("a", "bc"). Unlike
// Sum, all-empty input yields "0", not the empty string.
func SumAll[T ~string](a T, b ...T) string {
	hasher := crc32.NewIEEE()
	feed := func(s T) {
		// hash.Hash's Write is documented to never return an error.
		_, _ = hasher.Write([]byte(s))
	}

	feed(a)
	for i := range b {
		feed(b[i])
	}

	return strconv.FormatUint(uint64(hasher.Sum32()), 16)
}
|
|
|
|
// Rand returns a random checksum.
|
|
func Rand() string {
|
|
b := make([]byte, 128)
|
|
_, _ = rand.Read(b)
|
|
return Sum(b)
|
|
}
|
|
|
|
// Checksum is a checksum value. It is a plain string type: Write
// emits it verbatim and Read parses it back. The ForFile,
// ForHTTPHeader and ForHTTPResponse functions produce values that
// are hex-encoded CRC-32 sums (see Sum).
|
|
type Checksum string
|
|
|
|
// Write appends a checksum line to w, including
|
|
// a newline. The typical format is:
|
|
//
|
|
// <sum> <name>
|
|
// 3610a686 file.txt
|
|
//
|
|
// However, the checksum be any string value. Use ForFile to calculate
|
|
// a checksum, and Read to read this format.
|
|
func Write(w io.Writer, sum Checksum, name string) error {
|
|
_, err := fmt.Fprintf(w, "%s %s\n", sum, name)
|
|
return errz.Err(err)
|
|
}
|
|
|
|
// WriteFile writes a single {checksum,name} to path, overwriting
|
|
// the previous contents.
|
|
//
|
|
// See: Write.
|
|
func WriteFile(path string, sum Checksum, name string) error {
|
|
f, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, ioz.RWPerms)
|
|
if err != nil {
|
|
return errz.Wrap(err, "write checksum file")
|
|
}
|
|
err = Write(f, sum, name)
|
|
if err == nil {
|
|
return errz.Err(f.Close())
|
|
}
|
|
|
|
_ = f.Close()
|
|
return err
|
|
}
|
|
|
|
// ReadFile reads a checksum file from path.
|
|
//
|
|
// See Read for details.
|
|
func ReadFile(path string) (map[string]Checksum, error) {
|
|
f, err := os.Open(path)
|
|
if err != nil {
|
|
return nil, errz.Err(err)
|
|
}
|
|
|
|
defer func() { _ = f.Close() }()
|
|
|
|
return Read(f)
|
|
}
|
|
|
|
// Read reads checksums lines from r, returning a map
|
|
// of checksums keyed by name. Empty lines, and lines beginning
|
|
// with "#" (comments) are ignored. This function is the
|
|
// inverse of Write.
|
|
func Read(r io.Reader) (map[string]Checksum, error) {
|
|
sums := map[string]Checksum{}
|
|
|
|
sc := bufio.NewScanner(r)
|
|
for sc.Scan() {
|
|
line := strings.TrimSpace(sc.Text())
|
|
if line == "" || strings.HasPrefix(line, "#") {
|
|
continue
|
|
}
|
|
|
|
parts := strings.SplitN(line, " ", 2)
|
|
if len(parts) != 2 {
|
|
return nil, errz.Errorf("invalid checksum line: %q", line)
|
|
}
|
|
|
|
sums[parts[1]] = Checksum(parts[0])
|
|
}
|
|
|
|
return sums, errz.Wrap(sc.Err(), "read checksums")
|
|
}
|
|
|
|
// ForFile returns a checksum of the file at path.
|
|
// The checksum is based on the file's name, size, mode, and
|
|
// modification time. File contents are not read.
|
|
func ForFile(path string) (Checksum, error) {
|
|
fi, err := os.Stat(path)
|
|
if err != nil {
|
|
return "", errz.Wrap(err, "calculate file checksum")
|
|
}
|
|
|
|
buf := bytes.Buffer{}
|
|
buf.WriteString(fi.Name())
|
|
buf.WriteString(strconv.FormatInt(fi.ModTime().UnixNano(), 10))
|
|
buf.WriteString(strconv.FormatInt(fi.Size(), 10))
|
|
buf.WriteString(strconv.FormatUint(uint64(fi.Mode()), 10))
|
|
buf.WriteString(strconv.FormatBool(fi.IsDir()))
|
|
|
|
return Checksum(Sum(buf.Bytes())), nil
|
|
}
|
|
|
|
// ForHTTPHeader returns a checksum generated from URL u and
|
|
// the contents of header. If the header contains an Etag,
|
|
// that is used as the primary element. Otherwise, other
|
|
// values such as Content-Length and Last-Modified are
|
|
// considered.
|
|
//
|
|
// Deprecated: use ForHTTPResponse instead.
|
|
func ForHTTPHeader(u string, header http.Header) Checksum {
|
|
buf := bytes.Buffer{}
|
|
buf.WriteString(u)
|
|
if header != nil {
|
|
etag := header.Get("Etag")
|
|
if etag != "" {
|
|
buf.WriteString(etag)
|
|
} else {
|
|
buf.WriteString(header.Get("Content-Type"))
|
|
buf.WriteString(header.Get("Content-Disposition"))
|
|
buf.WriteString(header.Get("Content-Length"))
|
|
buf.WriteString(header.Get("Last-Modified"))
|
|
}
|
|
}
|
|
|
|
return Checksum(Sum(buf.Bytes()))
|
|
}
|
|
|
|
// ForHTTPResponse returns a checksum generated from the response's
|
|
// request URL and the contents of the response's header. If the header
|
|
// contains an Etag, that is used as the primary element. Otherwise,
|
|
// other values such as Content-Length and Last-Modified are considered.
|
|
//
|
|
// There's some trickiness with Etag. Note that by default, stdlib http.Client
|
|
// will set sneakily set the "Accept-Encoding: gzip" header on GET requests.
|
|
// However, this doesn't happen for HEAD requests. So, comparing a GET and HEAD
|
|
// response for the same URL may result in different checksums, because the
|
|
// server will likely return a different Etag for the gzipped response.
|
|
//
|
|
// # With gzip
|
|
// Etag: "069dbf690a12d5eb853feb8e04aeb49e-ssl-df"
|
|
//
|
|
// # Without gzip
|
|
// Etag: "069dbf690a12d5eb853feb8e04aeb49e-ssl"
|
|
//
|
|
// Note the "-ssl-df" suffix on the gzipped response. The "df" suffix is
|
|
// for "deflate".
|
|
//
|
|
// The solution here might be to always explicitly set the gzip header on all
|
|
// requests. However, when gzip is not explicitly set, the stdlib client
|
|
// transparently handles gzip compression, including on the body read end. So,
|
|
// ideally, we wouldn't change that part, so that we don't have to code for
|
|
// both compressed and uncompressed responses.
|
|
//
|
|
// Our hack for now it to trim the "-df" suffix from the Etag.
|
|
//
|
|
// REVISIT: ForHTTPResponse is no longer used. It should be removed.
|
|
func ForHTTPResponse(resp *http.Response) Checksum {
|
|
if resp == nil {
|
|
return ""
|
|
}
|
|
|
|
buf := bytes.Buffer{}
|
|
if resp.Request != nil && resp.Request.URL != nil {
|
|
buf.WriteString(resp.Request.URL.String() + "\n")
|
|
}
|
|
buf.WriteString(strconv.Itoa(int(resp.ContentLength)) + "\n")
|
|
header := resp.Header
|
|
if header != nil {
|
|
buf.WriteString(header.Get("Content-Encoding") + "\n")
|
|
etag := strings.TrimSpace(header.Get("Etag"))
|
|
if etag != "" {
|
|
etag = strings.TrimSuffix(etag, "-df")
|
|
buf.WriteString(etag + "\n")
|
|
} else {
|
|
buf.WriteString(header.Get("Content-Type") + "\n")
|
|
buf.WriteString(header.Get("Content-Disposition") + "\n")
|
|
buf.WriteString(header.Get("Content-Length") + "\n")
|
|
buf.WriteString(header.Get("Last-Modified") + "\n")
|
|
}
|
|
}
|
|
|
|
return Checksum(Sum(buf.Bytes()))
|
|
}
|