sq/libsq/files/download.go

package files

import (
	"context"
	"io"
	"net/http"
	"path/filepath"
	"time"

	"github.com/neilotoole/streamcache"

	"github.com/neilotoole/sq/libsq/core/errz"
	"github.com/neilotoole/sq/libsq/core/ioz"
	"github.com/neilotoole/sq/libsq/core/ioz/checksum"
	"github.com/neilotoole/sq/libsq/core/ioz/httpz"
	"github.com/neilotoole/sq/libsq/core/lg"
	"github.com/neilotoole/sq/libsq/core/lg/lgm"
	"github.com/neilotoole/sq/libsq/core/options"
	"github.com/neilotoole/sq/libsq/files/downloader"
	"github.com/neilotoole/sq/libsq/source"
)

var OptHTTPRequestTimeout = options.NewDuration(
	"http.request.timeout",
	"",
	0,
	time.Second*10,
	"HTTP/S request initial response timeout duration",
	`How long to wait for initial response from a HTTP/S endpoint before
timeout occurs. Reading the body of the response, such as a large HTTP file
download, is not affected by this option. Example: 500ms or 3s.
Contrast with http.response.timeout.`,
	options.TagSource,
)

var OptHTTPResponseTimeout = options.NewDuration(
	"http.response.timeout",
	"",
	0,
	0,
	"HTTP/S request completion timeout duration",
	`How long to wait for the entire HTTP transaction to complete. This includes
reading the body of the response, such as a large HTTP file download. Typically
this is set to 0, indicating no timeout. Contrast with http.request.timeout.`,
	options.TagSource,
)

var OptHTTPSInsecureSkipVerify = options.NewBool(
	"https.insecure-skip-verify",
	"",
	false,
	0,
	false,
	"Skip HTTPS TLS verification",
	"Skip HTTPS TLS verification. Useful when downloading against self-signed certs.",
	options.TagSource,
)

// maybeStartDownload starts a download for src if one is not already in progress
// or completed. If there's a download in progress, dlStream returns non-nil.
// If the file is already downloaded to disk (and is valid/fresh), dlFile
// returns non-empty and contains the absolute path to the downloaded file.
// Otherwise, a new download is started (on a spawned goroutine), and the
// stream returned from the downloader is added to Files.streams. On successful
// download and cache update completion, the stream is removed from Files.streams
// and the path to the cached file is added to Files.downloadedFiles.
//
// If arg checkFresh is false, and there's already a cached download on disk,
// then the cached file is returned immediately, and no download is started.
//
// It is guaranteed that one (and only one) of the returned values will be non-nil.
// REVISIT: look into use of checkFresh?
func (fs *Files) maybeStartDownload(ctx context.Context, src *source.Source, checkFresh bool) (dlFile string,
	dlStream *streamcache.Stream, err error,
) {
	var ok bool

	// If the file has just been downloaded, just return it. It doesn't
	// matter about checkFresh.
	if dlFile, ok = fs.downloadedFiles[src.Handle]; ok {
		return dlFile, nil, nil
	}

	// If there's already a download in progress, then we can just return
	// that stream.
	if dlStream, ok = fs.streams[src.Handle]; ok {
		// A download stream is always fresh, so we
		// can ignore checkFresh here.
		return "", dlStream, nil
	}

	dldr, err := fs.downloaderFor(ctx, src)
	if err != nil {
		return "", nil, err
	}

	if !checkFresh {
		// If we don't care about freshness, check if the download is
		// already on disk. If so, Downloader.CacheFile will return the
		// path to the cached file.
		dlFile, err = dldr.CacheFile(ctx)
		if err == nil && dlFile != "" {
			// The file is already on disk, so we can just return it.
			// REVISIT: Should we add dlFile to the downloadFiles map?
			return dlFile, nil, err
		}
	}

	// Having got this far, we need to talk to the downloader.
	var (
		dlErrCh    = make(chan error, 1)
		dlStreamCh = make(chan *streamcache.Stream, 1)
		dlFileCh   = make(chan string, 1)
	)

	// Our handler simply pushes the callback values into the channels, which
	// this main goroutine will select on at the bottom of the func. The call
	// to downloader.Get will be executed in a newly spawned goroutine below.
	h := downloader.Handler{
		Cached:   func(dlFile string) { dlFileCh <- dlFile },
		Uncached: func(dlStream *streamcache.Stream) { dlStreamCh <- dlStream },
		Error:    func(dlErr error) { dlErrCh <- dlErr },
	}

	go func() {
		// Spawn a goroutine to execute the download process.
		// The handler will be called before Get returns.
		cacheFile := dldr.Get(ctx, h)
		if cacheFile == "" {
			// Either the download failed, or cache update failed.
			return
		}

		// The download succeeded, and the cache was successfully updated.
		// We know that cacheFile exists now. If a stream was created (and
		// thus added to Files.streams), we can swap it out and instead
		// populate Files.downloadedFiles with the cacheFile. Thus, going
		// forward, any clients of Files will get the cacheFile instead of
		// the stream.

		// We need to lock here because we're accessing Files.streams.
		// So, this goroutine will block until the lock is available.
		// That shouldn't be an issue: the up-stack Files function that
		// acquired the lock will eventually return, releasing the lock,
		// at which point the swap will happen. No big deal.
		fs.mu.Lock()
		defer fs.mu.Unlock()

		if stream, ok := fs.streams[src.Handle]; ok && stream != nil {
			// The stream exists, and it's safe to close the stream's source,
			// (i.e. the http response body), because the body has already
			// been completely drained by the downloader: otherwise, we
			// wouldn't have a non-empty value for cacheFile.
			if c, ok := stream.Source().(io.Closer); ok {
				lg.WarnIfCloseError(lg.FromContext(ctx), lgm.CloseHTTPResponseBody, c)
			}
		}

		// Now perform the swap: populate Files.downloadedFiles with cacheFile,
		// and remove the stream from Files.streams.
		fs.downloadedFiles[src.Handle] = cacheFile
		delete(fs.streams, src.Handle)
	}() // end of goroutine

	// Here we wait on the handler channels.
	select {
	case <-ctx.Done():
		return "", nil, errz.Err(ctx.Err())
	case err = <-dlErrCh:
		return "", nil, err
	case dlStream = <-dlStreamCh:
		// New download stream. Add it to Files.streams,
		// and return the stream.
		fs.streams[src.Handle] = dlStream
		return "", dlStream, nil
	case dlFile = <-dlFileCh:
		// The file is already on disk, so we added it to Files.downloadedFiles,
		// and return its filepath.
		fs.downloadedFiles[src.Handle] = dlFile
		return dlFile, nil, nil
	}
}

// downloadPaths returns the paths for src's download cache dir and
// cache body file. It is not guaranteed that the returned paths exist.
func (fs *Files) downloadPaths(src *source.Source) (dlDir, dlFile string, err error) {
	var cacheDir string
	cacheDir, err = fs.CacheDirFor(src)
	if err != nil {
		return "", dlFile, err
	}

	// Note: we're depending on internal knowledge of the downloader impl here,
	// which is not great. It might be better to implement a function
	// in pkg downloader.
	dlDir = filepath.Join(cacheDir, "download", checksum.Sum([]byte(src.Location)))
	dlFile = filepath.Join(dlDir, "main", "body")
	return dlDir, dlFile, nil
}

// downloaderFor returns the downloader.Downloader for src, creating
// and caching it if necessary.
func (fs *Files) downloaderFor(ctx context.Context, src *source.Source) (*downloader.Downloader, error) {
	dl, ok := fs.downloaders[src.Handle]
	if ok {
		return dl, nil
	}

	dlDir, _, err := fs.downloadPaths(src)
	if err != nil {
		return nil, err
	}
	if err = ioz.RequireDir(dlDir); err != nil {
		return nil, err
	}

	c := fs.httpClientFor(ctx, src)
	if dl, err = downloader.New(src.Handle, c, src.Location, dlDir); err != nil {
		return nil, err
	}
	fs.downloaders[src.Handle] = dl
	return dl, nil
}

func (fs *Files) httpClientFor(ctx context.Context, src *source.Source) *http.Client {
	o := options.Merge(options.FromContext(ctx), src.Options)
	return httpz.NewClient(httpz.DefaultUserAgent,
		httpz.OptRequestTimeout(OptHTTPRequestTimeout.Get(o)),
		httpz.OptResponseTimeout(OptHTTPResponseTimeout.Get(o)),
		httpz.OptInsecureSkipVerify(OptHTTPSInsecureSkipVerify.Get(o)),
	)
}