Merge pull request #301 from numtide/feat/better-concurrency-model

feat: simplify pipeline model
2024-10-05 21:17:37 +03:00 · 2024-05-26 20:11:48 +00:00 · 2024-05-26 20:11:48 +00:00 · 06f15f4237
commit 06f15f4237
parent 6ce3d27519 ce14ee828f
12 changed files with 147 additions and 229 deletions
--- a/cli/format.go
+++ b/cli/format.go
@ -30,11 +30,11 @@ const (
 )

 var (
-	globalExcludes []glob.Glob
-	formatters     map[string]*format.Formatter
-	pipelines      map[string]*format.Pipeline
-	filesCh        chan *walk.File
-	processedCh    chan *walk.File
+	excludes   []glob.Glob
+	formatters map[string]*format.Formatter
+
+	filesCh     chan *walk.File
+	processedCh chan *walk.File

 	ErrFailOnChange = errors.New("unexpected changes detected, --fail-on-change is enabled")
 )
@ -73,21 +73,18 @@ func (f *Format) Run() (err error) {
 	}

 	// compile global exclude globs
-	if globalExcludes, err = format.CompileGlobs(cfg.Global.Excludes); err != nil {
+	if excludes, err = format.CompileGlobs(cfg.Global.Excludes); err != nil {
 		return fmt.Errorf("failed to compile global excludes: %w", err)
 	}

-	// initialise pipelines
-	pipelines = make(map[string]*format.Pipeline)
+	// initialise formatters
 	formatters = make(map[string]*format.Formatter)

-	// iterate the formatters in lexicographical order
-	for _, name := range cfg.Names {
-		// init formatter
-		formatterCfg := cfg.Formatters[name]
-		formatter, err := format.NewFormatter(name, Cli.TreeRoot, formatterCfg, globalExcludes)
+	for name, formatterCfg := range cfg.Formatters {
+		formatter, err := format.NewFormatter(name, Cli.TreeRoot, formatterCfg, excludes)
+
 		if errors.Is(err, format.ErrCommandNotFound) && Cli.AllowMissingFormatter {
-			log.Debugf("formatter not found: %v", name)
+			log.Debugf("formatter command not found: %v", name)
 			continue
 		} else if err != nil {
 			return fmt.Errorf("%w: failed to initialise formatter: %v", err, name)
@ -95,23 +92,6 @@ func (f *Format) Run() (err error) {

 		// store formatter by name
 		formatters[name] = formatter
-
-		// If no pipeline is configured, we add the formatter to a nominal pipeline of size 1 with the key being the
-		// formatter's name. If a pipeline is configured, we add the formatter to a pipeline keyed by
-		// 'p:<pipeline_name>' in which it is sorted by priority.
-		if formatterCfg.Pipeline == "" {
-			pipeline := format.Pipeline{}
-			pipeline.Add(formatter)
-			pipelines[name] = &pipeline
-		} else {
-			key := fmt.Sprintf("p:%s", formatterCfg.Pipeline)
-			pipeline, ok := pipelines[key]
-			if !ok {
-				pipeline = &format.Pipeline{}
-				pipelines[key] = pipeline
-			}
-			pipeline.Add(formatter)
-		}
 	}

 	// open the cache
@ -304,37 +284,43 @@ func walkFilesystem(ctx context.Context) func() error {
 func applyFormatters(ctx context.Context) func() error {
 	// create our own errgroup for concurrent formatting tasks
 	fg, ctx := errgroup.WithContext(ctx)
+	// simple optimization to avoid too many concurrent formatting tasks
+	// we can queue them up faster than the formatters can process them, this paces things a bit
+	fg.SetLimit(runtime.NumCPU())

-	// pre-initialise batches keyed by pipeline
-	batches := make(map[string][]*walk.File)
-	for key := range pipelines {
-		batches[key] = make([]*walk.File, 0, BatchSize)
-	}
+	// track batches of formatting task based on their batch keys, which are determined by the unique sequence of
+	// formatters which should be applied to their respective files
+	batches := make(map[string][]*format.Task)

-	// for a given pipeline key, add the provided file to the current batch and trigger a format if the batch size has
-	// been reached
-	tryApply := func(key string, file *walk.File) {
-		// append to batch
-		batches[key] = append(batches[key], file)
-
-		// check if the batch is full
+	apply := func(key string, flush bool) {
+		// lookup the batch and exit early if it's empty
 		batch := batches[key]
-		if len(batch) == BatchSize {
-			// get the pipeline
-			pipeline := pipelines[key]
+		if len(batch) == 0 {
+			return
+		}

-			// copy the batch
-			files := make([]*walk.File, len(batch))
-			copy(files, batch)
+		// process the batch if it's full, or we've been asked to flush partial batches
+		if flush || len(batch) == BatchSize {

-			// apply to the pipeline
+			// copy the batch as we re-use it for the next batch
+			tasks := make([]*format.Task, len(batch))
+			copy(tasks, batch)
+
+			// asynchronously apply the sequence formatters to the batch
 			fg.Go(func() error {
-				if err := pipeline.Apply(ctx, files); err != nil {
-					return err
+				// iterate the formatters, applying them in sequence to the batch of tasks
+				// we get the formatters list from the first task since they have all the same formatters list
+				for _, f := range tasks[0].Formatters {
+					if err := f.Apply(ctx, tasks); err != nil {
+						return err
+					}
 				}
-				for _, path := range files {
-					processedCh <- path
+
+				// pass each file to the processed channel
+				for _, task := range tasks {
+					processedCh <- task.File
 				}
+
 				return nil
 			})

@ -343,25 +329,12 @@ func applyFormatters(ctx context.Context) func() error {
 		}
 	}

-	// format any partial batches
-	flushBatches := func() {
-		for key, pipeline := range pipelines {
-
-			batch := batches[key]
-			pipeline := pipeline // capture for closure
-
-			if len(batch) > 0 {
-				fg.Go(func() error {
-					if err := pipeline.Apply(ctx, batch); err != nil {
-						return fmt.Errorf("%s failure: %w", key, err)
-					}
-					for _, path := range batch {
-						processedCh <- path
-					}
-					return nil
-				})
-			}
-		}
+	tryApply := func(task *format.Task) {
+		// append to batch
+		key := task.BatchKey
+		batches[key] = append(batches[key], task)
+		// try to apply
+		apply(key, false)
 	}

 	return func() error {
@ -370,35 +343,38 @@ func applyFormatters(ctx context.Context) func() error {
 			close(processedCh)
 		}()

-		// iterate the files channel, checking if any pipeline wants it, and attempting to apply if so.
+		// iterate the files channel
 		for file := range filesCh {
-			var matches []string

-			for key, pipeline := range pipelines {
-				if !pipeline.Wants(file) {
-					continue
+			// determine a list of formatters that are interested in file
+			var matches []*format.Formatter
+			for _, formatter := range formatters {
+				if formatter.Wants(file) {
+					matches = append(matches, formatter)
 				}
-				matches = append(matches, key)
-				tryApply(key, file)
 			}
-			switch len(matches) {
-			case 0:
-				log.Debugf("no match found: %s", file.Path)
+
+			if len(matches) == 0 {
 				// no match, so we send it direct to the processed channel
+				log.Debugf("no match found: %s", file.Path)
 				processedCh <- file
-			case 1:
+			} else {
+				// record the match
 				stats.Add(stats.Matched, 1)
-			default:
-				return fmt.Errorf("path '%s' matched multiple formatters/pipelines %v", file.Path, matches)
+				// create a new format task, add it to a batch based on its batch key and try to apply if the batch is full
+				task := format.NewTask(file, matches)
+				tryApply(&task)
 			}
 		}

 		// flush any partial batches which remain
-		flushBatches()
+		for key := range batches {
+			apply(key, true)
+		}

 		// wait for all outstanding formatting tasks to complete
 		if err := fg.Wait(); err != nil {
-			return fmt.Errorf("pipeline processing failure: %w", err)
+			return fmt.Errorf("formatting failure: %w", err)
 		}
 		return nil
 	}
--- a/cli/format_test.go
+++ b/cli/format_test.go
@ -196,37 +196,6 @@ func TestIncludesAndExcludes(t *testing.T) {
 	assertStats(t, as, 31, 31, 2, 0)
 }

-func TestMatchingMultiplePipelines(t *testing.T) {
-	as := require.New(t)
-
-	tempDir := test.TempExamples(t)
-	configPath := tempDir + "/multiple.toml"
-
-	cfg := config2.Config{
-		Formatters: map[string]*config2.Formatter{
-			"echo": {
-				Command:  "echo",
-				Includes: []string{"*"},
-			},
-			"touch": {
-				Command:  "touch",
-				Includes: []string{"*"},
-			},
-		},
-	}
-
-	test.WriteConfig(t, configPath, cfg)
-	_, err := cmd(t, "-c", "--config-file", configPath, "--tree-root", tempDir)
-	as.ErrorContains(err, "matched multiple formatters/pipelines")
-	as.ErrorContains(err, "echo")
-	as.ErrorContains(err, "touch")
-
-	// run with only one formatter
-	test.WriteConfig(t, configPath, cfg)
-	_, err = cmd(t, "-c", "--config-file", configPath, "--tree-root", tempDir, "-f", "echo")
-	as.NoError(err)
-}
-
 func TestCache(t *testing.T) {
 	as := require.New(t)

@ -604,25 +573,23 @@ func TestDeterministicOrderingInPipeline(t *testing.T) {

 	test.WriteConfig(t, configPath, config2.Config{
 		Formatters: map[string]*config2.Formatter{
-			// a and b should execute in lexicographical order as they have default priority 0, with c last since it has
-			// priority 1
+			// a and b have no priority set, which means they default to 0 and should execute first
+			// a and b should execute in lexicographical order
+			// c should execute first since it has a priority of 1
 			"fmt-a": {
 				Command:  "test-fmt",
 				Options:  []string{"fmt-a"},
 				Includes: []string{"*.py"},
-				Pipeline: "foo",
 			},
 			"fmt-b": {
 				Command:  "test-fmt",
 				Options:  []string{"fmt-b"},
 				Includes: []string{"*.py"},
-				Pipeline: "foo",
 			},
 			"fmt-c": {
 				Command:  "test-fmt",
 				Options:  []string{"fmt-c"},
 				Includes: []string{"*.py"},
-				Pipeline: "foo",
 				Priority: 1,
 			},
 		},
--- a/config/config.go
+++ b/config/config.go
@ -2,7 +2,6 @@ package config

 import (
 	"fmt"
-	"sort"

 	"github.com/BurntSushi/toml"
 )
@ -13,7 +12,6 @@ type Config struct {
 		// Excludes is an optional list of glob patterns used to exclude certain files from all formatters.
 		Excludes []string
 	}
-	Names      []string              `toml:"-"`
 	Formatters map[string]*Formatter `toml:"formatter"`
 }

@ -40,12 +38,5 @@ func ReadFile(path string, names []string) (cfg *Config, err error) {
 		cfg.Formatters = filtered
 	}

-	// sort the formatter names so that, as we construct pipelines, we add formatters in a determinstic fashion. This
-	// ensures a deterministic order even when all priority values are the same e.g. 0
-	for name := range cfg.Formatters {
-		cfg.Names = append(cfg.Names, name)
-	}
-	sort.Strings(cfg.Names)
-
 	return
 }
--- a/config/config_test.go
+++ b/config/config_test.go
@ -59,7 +59,6 @@ func TestReadConfigFile(t *testing.T) {
 	as.Nil(alejandra.Options)
 	as.Equal([]string{"*.nix"}, alejandra.Includes)
 	as.Equal([]string{"examples/nix/sources.nix"}, alejandra.Excludes)
-	as.Equal("nix", alejandra.Pipeline)
 	as.Equal(1, alejandra.Priority)

 	// deadnix
@ -69,7 +68,6 @@ func TestReadConfigFile(t *testing.T) {
 	as.Nil(deadnix.Options)
 	as.Equal([]string{"*.nix"}, deadnix.Includes)
 	as.Nil(deadnix.Excludes)
-	as.Equal("nix", deadnix.Pipeline)
 	as.Equal(2, deadnix.Priority)

 	// ruby
--- a/config/formatter.go
+++ b/config/formatter.go
@ -9,8 +9,6 @@ type Formatter struct {
 	Includes []string
 	// Excludes is an optional list of glob patterns used to exclude certain files from this Formatter.
 	Excludes []string
-	// Indicates this formatter should be executed as part of a group of formatters all sharing the same pipeline key.
-	Pipeline string
-	// Indicates the order of precedence when executing as part of a pipeline.
+	// Indicates the order of precedence when executing this Formatter in a sequence of Formatters.
 	Priority int
 }
--- a/docs/configure.md
+++ b/docs/configure.md
@ -23,19 +23,19 @@ includes = ["*.go"]
 command = "black"
 includes = ["*.py"]

+# use the priority field to control the order of execution
+
 # run shellcheck first
 [formatter.shellcheck]
 command = "shellcheck"
 includes = ["*.sh"]
-pipeline = "sh"
-priority = 0
+priority = 0    # default is 0, but we set it here for clarity

 # shfmt second
 [formatter.shfmt]
 command = "shfmt"
 options = ["-s", "-w"]
 includes = ["*.sh"]
-pipeline = "sh"
 priority = 1
 ```

@ -49,13 +49,21 @@ priority = 1
 -   `options` - an optional list of args to be passed to `command`.
 -   `includes` - a list of glob patterns used to determine whether the formatter should be applied against a given path.
 -   `excludes` - an optional list of glob patterns used to exclude certain files from this formatter.
-   `pipeline` - an optional key used to group related formatters together, ensuring they are executed sequentially
-    against a given path.
-   `priority` - indicates the order of execution when this formatter is operating as part of a pipeline. Greater
-    precedence is given to lower numbers, with the default being `0`.
+-   `priority` - influences the order of execution. Greater precedence is given to lower numbers, with the default being `0`.

-> When two or more formatters in a pipeline have the same priority, they are executed in lexicographical order to
-> ensure deterministic behaviour over multiple executions.
+## Same file, multiple formatters?
+
+For each file, `treefmt` determines a list of formatters based on the configured `includes` / `excludes` rules. This list is
+then sorted, first by priority (lower the value, higher the precedence) and secondly by formatter name (lexicographically).
+
+The resultant sequence of formatters is used to create a batch key, and similarly matched files get added to that batch
+until it is full, at which point the files are passed to each formatter in turn.
+
+This means that `treefmt` **guarantees only one formatter will be operating on a given file at any point in time**.
+Another consequence is that formatting is deterministic for a given file and a given `treefmt` configuration.
+
+By setting the priority fields appropriately, you can control the order in which those formatters are applied for any
+files they _both happen to match on_.

 ## Supported Formatters

--- a/format/formatter.go
+++ b/format/formatter.go
@ -40,43 +40,28 @@ func (f *Formatter) Executable() string {
 	return f.executable
 }

-func (f *Formatter) Apply(ctx context.Context, files []*walk.File, filter bool) error {
+func (f *Formatter) Name() string {
+	return f.name
+}
+
+func (f *Formatter) Priority() int {
+	return f.config.Priority
+}
+
+func (f *Formatter) Apply(ctx context.Context, tasks []*Task) error {
 	start := time.Now()

 	// construct args, starting with config
 	args := f.config.Options

-	// If filter is true it indicates we are executing as part of a pipeline.
-	// In such a scenario each formatter must sub filter the paths provided as different formatters might want different
-	// files in a pipeline.
-	if filter {
-		// reset the batch
-		f.batch = f.batch[:0]
+	// exit early if nothing to process
+	if len(tasks) == 0 {
+		return nil
+	}

-		// filter paths
-		for _, file := range files {
-			if f.Wants(file) {
-				f.batch = append(f.batch, file.RelPath)
-			}
-		}
-
-		// exit early if nothing to process
-		if len(f.batch) == 0 {
-			return nil
-		}
-
-		// append paths to the args
-		args = append(args, f.batch...)
-	} else {
-		// exit early if nothing to process
-		if len(files) == 0 {
-			return nil
-		}
-
-		// append paths to the args
-		for _, file := range files {
-			args = append(args, file.RelPath)
-		}
+	// append paths to the args
+	for _, task := range tasks {
+		args = append(args, task.File.RelPath)
 	}

 	// execute the command
@ -95,7 +80,7 @@ func (f *Formatter) Apply(ctx context.Context, files []*walk.File, filter bool)

 	//

-	f.log.Infof("%v files processed in %v", len(files), time.Now().Sub(start))
+	f.log.Infof("%v files processed in %v", len(tasks), time.Now().Sub(start))

 	return nil
 }
@ -136,10 +121,10 @@ func NewFormatter(
 	f.executable = executable

 	// initialise internal state
-	if cfg.Pipeline == "" {
-		f.log = log.WithPrefix(fmt.Sprintf("format | %s", name))
+	if cfg.Priority > 0 {
+		f.log = log.WithPrefix(fmt.Sprintf("format | %s[%d]", name, cfg.Priority))
 	} else {
-		f.log = log.WithPrefix(fmt.Sprintf("format | %s[%s]", cfg.Pipeline, name))
+		f.log = log.WithPrefix(fmt.Sprintf("format | %s", name))
 	}

 	f.includes, err = CompileGlobs(cfg.Includes)
--- a/format/pipeline.go
+++ b/format/pipeline.go
@ -1,40 +0,0 @@
-package format
-
-import (
-	"context"
-	"slices"
-
-	"git.numtide.com/numtide/treefmt/walk"
-)
-
-type Pipeline struct {
-	sequence []*Formatter
-}
-
-func (p *Pipeline) Add(f *Formatter) {
-	p.sequence = append(p.sequence, f)
-	// sort by priority in ascending order
-	slices.SortFunc(p.sequence, func(a, b *Formatter) int {
-		return a.config.Priority - b.config.Priority
-	})
-}
-
-func (p *Pipeline) Wants(path *walk.File) bool {
-	var match bool
-	for _, f := range p.sequence {
-		match = f.Wants(path)
-		if match {
-			break
-		}
-	}
-	return match
-}
-
-func (p *Pipeline) Apply(ctx context.Context, paths []*walk.File) error {
-	for _, f := range p.sequence {
-		if err := f.Apply(ctx, paths, len(p.sequence) > 1); err != nil {
-			return err
-		}
-	}
-	return nil
-}
--- a/format/task.go
+++ b/format/task.go
@ -0,0 +1,42 @@
+package format
+
+import (
+	"cmp"
+	"slices"
+
+	"git.numtide.com/numtide/treefmt/walk"
+)
+
+type Task struct {
+	File       *walk.File
+	Formatters []*Formatter
+	BatchKey   string
+}
+
+func NewTask(file *walk.File, formatters []*Formatter) Task {
+	// sort by priority in ascending order
+	slices.SortFunc(formatters, func(a, b *Formatter) int {
+		priorityA := a.Priority()
+		priorityB := b.Priority()
+
+		result := priorityA - priorityB
+		if result == 0 {
+			// formatters with the same priority are sorted lexicographically to ensure a deterministic outcome
+			result = cmp.Compare(a.Name(), b.Name())
+		}
+		return result
+	})
+
+	// construct a batch key which represents the unique sequence of formatters to be applied to file
+	var key string
+	for _, f := range formatters {
+		key += f.name + ":"
+	}
+	key = key[:len(key)-1]
+
+	return Task{
+		File:       file,
+		Formatters: formatters,
+		BatchKey:   key,
+	}
+}
--- a/nix/treefmt.nix
+++ b/nix/treefmt.nix
@ -24,17 +24,14 @@

      settings.formatter = {
        deadnix = {
-          pipeline = "nix";
          priority = 1;
        };

        statix = {
-          pipeline = "nix";
          priority = 2;
        };

        alejandra = {
-          pipeline = "nix";
          priority = 3;
        };

--- a/test/examples/nixpkgs.toml
+++ b/test/examples/nixpkgs.toml
@ -4,11 +4,9 @@
 command = "deadnix"
 options = ["--edit"]
 includes = ["*.nix"]
-pipeline = "nix"
 priority = 1

 [formatter.nixpkgs-fmt]
 command = "nixpkgs-fmt"
 includes = ["*.nix"]
-pipeline = "nix"
 priority = 2
--- a/test/examples/treefmt.toml
+++ b/test/examples/treefmt.toml
@ -31,13 +31,11 @@ command = "alejandra"
 includes = ["*.nix"]
 # Act as an example on how to exclude specific files
 excludes = ["examples/nix/sources.nix"]
-pipeline = "nix"
 priority = 1

 [formatter.deadnix]
 command = "deadnix"
 includes = ["*.nix"]
-pipeline = "nix"
 priority = 2

 [formatter.ruby]