From ce14ee828faa35aa39b95f9ac16a16881699506f Mon Sep 17 00:00:00 2001
From: Brian McGee <brian@bmcgee.ie>
Date: Fri, 24 May 2024 11:42:40 +0100
Subject: [PATCH] feat: simplify pipeline model

For each path we determine the list of formatters that are interested in formatting it. From there, we sort
the list of formatters first by priority (lower value, higher priority) and then by name (lexicographically).

With this information we create a batch key which is based on the unique sequence of formatters. When enough paths with the same sequence is ready we apply them in order to each formatter.

By default, with no special configuration, this model guarantees that a given path will only be processed by one formatter at a time.

If a user wishes to influence the order in which formatters are applied they can use the priority field.

Signed-off-by: Brian McGee <brian@bmcgee.ie>
---
 cli/format.go              | 152 ++++++++++++++++---------------------
 cli/format_test.go         |  39 +---------
 config/config.go           |   9 ---
 config/config_test.go      |   2 -
 config/formatter.go        |   4 +-
 docs/configure.md          |  26 ++++---
 format/formatter.go        |  55 +++++---------
 format/pipeline.go         |  40 ----------
 format/task.go             |  42 ++++++++++
 nix/treefmt.nix            |   3 -
 test/examples/nixpkgs.toml |   2 -
 test/examples/treefmt.toml |   2 -
 12 files changed, 147 insertions(+), 229 deletions(-)
 delete mode 100644 format/pipeline.go
 create mode 100644 format/task.go

diff --git a/cli/format.go b/cli/format.go
index 4b14220..e1259f6 100644
--- a/cli/format.go
+++ b/cli/format.go
@@ -30,11 +30,11 @@ const (
 )
 
 var (
-	globalExcludes []glob.Glob
-	formatters     map[string]*format.Formatter
-	pipelines      map[string]*format.Pipeline
-	filesCh        chan *walk.File
-	processedCh    chan *walk.File
+	excludes   []glob.Glob
+	formatters map[string]*format.Formatter
+
+	filesCh     chan *walk.File
+	processedCh chan *walk.File
 
 	ErrFailOnChange = errors.New("unexpected changes detected, --fail-on-change is enabled")
 )
@@ -73,21 +73,18 @@ func (f *Format) Run() (err error) {
 	}
 
 	// compile global exclude globs
-	if globalExcludes, err = format.CompileGlobs(cfg.Global.Excludes); err != nil {
+	if excludes, err = format.CompileGlobs(cfg.Global.Excludes); err != nil {
 		return fmt.Errorf("failed to compile global excludes: %w", err)
 	}
 
-	// initialise pipelines
-	pipelines = make(map[string]*format.Pipeline)
+	// initialise formatters
 	formatters = make(map[string]*format.Formatter)
 
-	// iterate the formatters in lexicographical order
-	for _, name := range cfg.Names {
-		// init formatter
-		formatterCfg := cfg.Formatters[name]
-		formatter, err := format.NewFormatter(name, Cli.TreeRoot, formatterCfg, globalExcludes)
+	for name, formatterCfg := range cfg.Formatters {
+		formatter, err := format.NewFormatter(name, Cli.TreeRoot, formatterCfg, excludes)
+
 		if errors.Is(err, format.ErrCommandNotFound) && Cli.AllowMissingFormatter {
-			log.Debugf("formatter not found: %v", name)
+			log.Debugf("formatter command not found: %v", name)
 			continue
 		} else if err != nil {
 			return fmt.Errorf("%w: failed to initialise formatter: %v", err, name)
@@ -95,23 +92,6 @@ func (f *Format) Run() (err error) {
 
 		// store formatter by name
 		formatters[name] = formatter
-
-		// If no pipeline is configured, we add the formatter to a nominal pipeline of size 1 with the key being the
-		// formatter's name. If a pipeline is configured, we add the formatter to a pipeline keyed by
-		// 'p:<pipeline_name>' in which it is sorted by priority.
-		if formatterCfg.Pipeline == "" {
-			pipeline := format.Pipeline{}
-			pipeline.Add(formatter)
-			pipelines[name] = &pipeline
-		} else {
-			key := fmt.Sprintf("p:%s", formatterCfg.Pipeline)
-			pipeline, ok := pipelines[key]
-			if !ok {
-				pipeline = &format.Pipeline{}
-				pipelines[key] = pipeline
-			}
-			pipeline.Add(formatter)
-		}
 	}
 
 	// open the cache
@@ -304,37 +284,43 @@ func walkFilesystem(ctx context.Context) func() error {
 func applyFormatters(ctx context.Context) func() error {
 	// create our own errgroup for concurrent formatting tasks
 	fg, ctx := errgroup.WithContext(ctx)
+	// simple optimization to avoid too many concurrent formatting tasks
+	// we can queue them up faster than the formatters can process them, this paces things a bit
+	fg.SetLimit(runtime.NumCPU())
 
-	// pre-initialise batches keyed by pipeline
-	batches := make(map[string][]*walk.File)
-	for key := range pipelines {
-		batches[key] = make([]*walk.File, 0, BatchSize)
-	}
+	// track batches of formatting task based on their batch keys, which are determined by the unique sequence of
+	// formatters which should be applied to their respective files
+	batches := make(map[string][]*format.Task)
 
-	// for a given pipeline key, add the provided file to the current batch and trigger a format if the batch size has
-	// been reached
-	tryApply := func(key string, file *walk.File) {
-		// append to batch
-		batches[key] = append(batches[key], file)
-
-		// check if the batch is full
+	apply := func(key string, flush bool) {
+		// lookup the batch and exit early if it's empty
 		batch := batches[key]
-		if len(batch) == BatchSize {
-			// get the pipeline
-			pipeline := pipelines[key]
+		if len(batch) == 0 {
+			return
+		}
 
-			// copy the batch
-			files := make([]*walk.File, len(batch))
-			copy(files, batch)
+		// process the batch if it's full, or we've been asked to flush partial batches
+		if flush || len(batch) == BatchSize {
 
-			// apply to the pipeline
+			// copy the batch as we re-use it for the next batch
+			tasks := make([]*format.Task, len(batch))
+			copy(tasks, batch)
+
+			// asynchronously apply the sequence formatters to the batch
 			fg.Go(func() error {
-				if err := pipeline.Apply(ctx, files); err != nil {
-					return err
+				// iterate the formatters, applying them in sequence to the batch of tasks
+				// we get the formatters list from the first task since they have all the same formatters list
+				for _, f := range tasks[0].Formatters {
+					if err := f.Apply(ctx, tasks); err != nil {
+						return err
+					}
 				}
-				for _, path := range files {
-					processedCh <- path
+
+				// pass each file to the processed channel
+				for _, task := range tasks {
+					processedCh <- task.File
 				}
+
 				return nil
 			})
 
@@ -343,25 +329,12 @@ func applyFormatters(ctx context.Context) func() error {
 		}
 	}
 
-	// format any partial batches
-	flushBatches := func() {
-		for key, pipeline := range pipelines {
-
-			batch := batches[key]
-			pipeline := pipeline // capture for closure
-
-			if len(batch) > 0 {
-				fg.Go(func() error {
-					if err := pipeline.Apply(ctx, batch); err != nil {
-						return fmt.Errorf("%s failure: %w", key, err)
-					}
-					for _, path := range batch {
-						processedCh <- path
-					}
-					return nil
-				})
-			}
-		}
+	tryApply := func(task *format.Task) {
+		// append to batch
+		key := task.BatchKey
+		batches[key] = append(batches[key], task)
+		// try to apply
+		apply(key, false)
 	}
 
 	return func() error {
@@ -370,35 +343,38 @@ func applyFormatters(ctx context.Context) func() error {
 			close(processedCh)
 		}()
 
-		// iterate the files channel, checking if any pipeline wants it, and attempting to apply if so.
+		// iterate the files channel
 		for file := range filesCh {
-			var matches []string
 
-			for key, pipeline := range pipelines {
-				if !pipeline.Wants(file) {
-					continue
+			// determine a list of formatters that are interested in file
+			var matches []*format.Formatter
+			for _, formatter := range formatters {
+				if formatter.Wants(file) {
+					matches = append(matches, formatter)
 				}
-				matches = append(matches, key)
-				tryApply(key, file)
 			}
-			switch len(matches) {
-			case 0:
-				log.Debugf("no match found: %s", file.Path)
+
+			if len(matches) == 0 {
 				// no match, so we send it direct to the processed channel
+				log.Debugf("no match found: %s", file.Path)
 				processedCh <- file
-			case 1:
+			} else {
+				// record the match
 				stats.Add(stats.Matched, 1)
-			default:
-				return fmt.Errorf("path '%s' matched multiple formatters/pipelines %v", file.Path, matches)
+				// create a new format task, add it to a batch based on its batch key and try to apply if the batch is full
+				task := format.NewTask(file, matches)
+				tryApply(&task)
 			}
 		}
 
 		// flush any partial batches which remain
-		flushBatches()
+		for key := range batches {
+			apply(key, true)
+		}
 
 		// wait for all outstanding formatting tasks to complete
 		if err := fg.Wait(); err != nil {
-			return fmt.Errorf("pipeline processing failure: %w", err)
+			return fmt.Errorf("formatting failure: %w", err)
 		}
 		return nil
 	}
diff --git a/cli/format_test.go b/cli/format_test.go
index e4c3b2a..559accc 100644
--- a/cli/format_test.go
+++ b/cli/format_test.go
@@ -196,37 +196,6 @@ func TestIncludesAndExcludes(t *testing.T) {
 	assertStats(t, as, 31, 31, 2, 0)
 }
 
-func TestMatchingMultiplePipelines(t *testing.T) {
-	as := require.New(t)
-
-	tempDir := test.TempExamples(t)
-	configPath := tempDir + "/multiple.toml"
-
-	cfg := config2.Config{
-		Formatters: map[string]*config2.Formatter{
-			"echo": {
-				Command:  "echo",
-				Includes: []string{"*"},
-			},
-			"touch": {
-				Command:  "touch",
-				Includes: []string{"*"},
-			},
-		},
-	}
-
-	test.WriteConfig(t, configPath, cfg)
-	_, err := cmd(t, "-c", "--config-file", configPath, "--tree-root", tempDir)
-	as.ErrorContains(err, "matched multiple formatters/pipelines")
-	as.ErrorContains(err, "echo")
-	as.ErrorContains(err, "touch")
-
-	// run with only one formatter
-	test.WriteConfig(t, configPath, cfg)
-	_, err = cmd(t, "-c", "--config-file", configPath, "--tree-root", tempDir, "-f", "echo")
-	as.NoError(err)
-}
-
 func TestCache(t *testing.T) {
 	as := require.New(t)
 
@@ -604,25 +573,23 @@ func TestDeterministicOrderingInPipeline(t *testing.T) {
 
 	test.WriteConfig(t, configPath, config2.Config{
 		Formatters: map[string]*config2.Formatter{
-			// a and b should execute in lexicographical order as they have default priority 0, with c last since it has
-			// priority 1
+			// a and b have no priority set, which means they default to 0 and should execute first
+			// a and b should execute in lexicographical order
+			// c should execute first since it has a priority of 1
 			"fmt-a": {
 				Command:  "test-fmt",
 				Options:  []string{"fmt-a"},
 				Includes: []string{"*.py"},
-				Pipeline: "foo",
 			},
 			"fmt-b": {
 				Command:  "test-fmt",
 				Options:  []string{"fmt-b"},
 				Includes: []string{"*.py"},
-				Pipeline: "foo",
 			},
 			"fmt-c": {
 				Command:  "test-fmt",
 				Options:  []string{"fmt-c"},
 				Includes: []string{"*.py"},
-				Pipeline: "foo",
 				Priority: 1,
 			},
 		},
diff --git a/config/config.go b/config/config.go
index 2fb5f54..d9e281f 100644
--- a/config/config.go
+++ b/config/config.go
@@ -2,7 +2,6 @@ package config
 
 import (
 	"fmt"
-	"sort"
 
 	"github.com/BurntSushi/toml"
 )
@@ -13,7 +12,6 @@ type Config struct {
 		// Excludes is an optional list of glob patterns used to exclude certain files from all formatters.
 		Excludes []string
 	}
-	Names      []string              `toml:"-"`
 	Formatters map[string]*Formatter `toml:"formatter"`
 }
 
@@ -40,12 +38,5 @@ func ReadFile(path string, names []string) (cfg *Config, err error) {
 		cfg.Formatters = filtered
 	}
 
-	// sort the formatter names so that, as we construct pipelines, we add formatters in a determinstic fashion. This
-	// ensures a deterministic order even when all priority values are the same e.g. 0
-	for name := range cfg.Formatters {
-		cfg.Names = append(cfg.Names, name)
-	}
-	sort.Strings(cfg.Names)
-
 	return
 }
diff --git a/config/config_test.go b/config/config_test.go
index 6ad7493..a99f6f8 100644
--- a/config/config_test.go
+++ b/config/config_test.go
@@ -59,7 +59,6 @@ func TestReadConfigFile(t *testing.T) {
 	as.Nil(alejandra.Options)
 	as.Equal([]string{"*.nix"}, alejandra.Includes)
 	as.Equal([]string{"examples/nix/sources.nix"}, alejandra.Excludes)
-	as.Equal("nix", alejandra.Pipeline)
 	as.Equal(1, alejandra.Priority)
 
 	// deadnix
@@ -69,7 +68,6 @@ func TestReadConfigFile(t *testing.T) {
 	as.Nil(deadnix.Options)
 	as.Equal([]string{"*.nix"}, deadnix.Includes)
 	as.Nil(deadnix.Excludes)
-	as.Equal("nix", deadnix.Pipeline)
 	as.Equal(2, deadnix.Priority)
 
 	// ruby
diff --git a/config/formatter.go b/config/formatter.go
index 2c14c63..9e77e6a 100644
--- a/config/formatter.go
+++ b/config/formatter.go
@@ -9,8 +9,6 @@ type Formatter struct {
 	Includes []string
 	// Excludes is an optional list of glob patterns used to exclude certain files from this Formatter.
 	Excludes []string
-	// Indicates this formatter should be executed as part of a group of formatters all sharing the same pipeline key.
-	Pipeline string
-	// Indicates the order of precedence when executing as part of a pipeline.
+	// Indicates the order of precedence when executing this Formatter in a sequence of Formatters.
 	Priority int
 }
diff --git a/docs/configure.md b/docs/configure.md
index d0601af..2be0086 100644
--- a/docs/configure.md
+++ b/docs/configure.md
@@ -23,19 +23,19 @@ includes = ["*.go"]
 command = "black"
 includes = ["*.py"]
 
+# use the priority field to control the order of execution
+
 # run shellcheck first
 [formatter.shellcheck]
 command = "shellcheck"
 includes = ["*.sh"]
-pipeline = "sh"
-priority = 0
+priority = 0    # default is 0, but we set it here for clarity
 
 # shfmt second
 [formatter.shfmt]
 command = "shfmt"
 options = ["-s", "-w"]
 includes = ["*.sh"]
-pipeline = "sh"
 priority = 1
 ```
 
@@ -49,13 +49,21 @@ priority = 1
 -   `options` - an optional list of args to be passed to `command`.
 -   `includes` - a list of glob patterns used to determine whether the formatter should be applied against a given path.
 -   `excludes` - an optional list of glob patterns used to exclude certain files from this formatter.
--   `pipeline` - an optional key used to group related formatters together, ensuring they are executed sequentially
-    against a given path.
--   `priority` - indicates the order of execution when this formatter is operating as part of a pipeline. Greater
-    precedence is given to lower numbers, with the default being `0`.
+-   `priority` - influences the order of execution. Greater precedence is given to lower numbers, with the default being `0`.
 
-> When two or more formatters in a pipeline have the same priority, they are executed in lexicographical order to
-> ensure deterministic behaviour over multiple executions.
+## Same file, multiple formatters?
+
+For each file, `treefmt` determines a list of formatters based on the configured `includes` / `excludes` rules. This list is
+then sorted, first by priority (lower the value, higher the precedence) and secondly by formatter name (lexicographically).
+
+The resultant sequence of formatters is used to create a batch key, and similarly matched files get added to that batch
+until it is full, at which point the files are passed to each formatter in turn.
+
+This means that `treefmt` **guarantees only one formatter will be operating on a given file at any point in time**.
+Another consequence is that formatting is deterministic for a given file and a given `treefmt` configuration.
+
+By setting the priority fields appropriately, you can control the order in which those formatters are applied for any
+files they _both happen to match on_.
 
 ## Supported Formatters
 
diff --git a/format/formatter.go b/format/formatter.go
index dbe2f11..1db9164 100644
--- a/format/formatter.go
+++ b/format/formatter.go
@@ -40,43 +40,28 @@ func (f *Formatter) Executable() string {
 	return f.executable
 }
 
-func (f *Formatter) Apply(ctx context.Context, files []*walk.File, filter bool) error {
+func (f *Formatter) Name() string {
+	return f.name
+}
+
+func (f *Formatter) Priority() int {
+	return f.config.Priority
+}
+
+func (f *Formatter) Apply(ctx context.Context, tasks []*Task) error {
 	start := time.Now()
 
 	// construct args, starting with config
 	args := f.config.Options
 
-	// If filter is true it indicates we are executing as part of a pipeline.
-	// In such a scenario each formatter must sub filter the paths provided as different formatters might want different
-	// files in a pipeline.
-	if filter {
-		// reset the batch
-		f.batch = f.batch[:0]
+	// exit early if nothing to process
+	if len(tasks) == 0 {
+		return nil
+	}
 
-		// filter paths
-		for _, file := range files {
-			if f.Wants(file) {
-				f.batch = append(f.batch, file.RelPath)
-			}
-		}
-
-		// exit early if nothing to process
-		if len(f.batch) == 0 {
-			return nil
-		}
-
-		// append paths to the args
-		args = append(args, f.batch...)
-	} else {
-		// exit early if nothing to process
-		if len(files) == 0 {
-			return nil
-		}
-
-		// append paths to the args
-		for _, file := range files {
-			args = append(args, file.RelPath)
-		}
+	// append paths to the args
+	for _, task := range tasks {
+		args = append(args, task.File.RelPath)
 	}
 
 	// execute the command
@@ -95,7 +80,7 @@ func (f *Formatter) Apply(ctx context.Context, files []*walk.File, filter bool)
 
 	//
 
-	f.log.Infof("%v files processed in %v", len(files), time.Now().Sub(start))
+	f.log.Infof("%v files processed in %v", len(tasks), time.Now().Sub(start))
 
 	return nil
 }
@@ -136,10 +121,10 @@ func NewFormatter(
 	f.executable = executable
 
 	// initialise internal state
-	if cfg.Pipeline == "" {
-		f.log = log.WithPrefix(fmt.Sprintf("format | %s", name))
+	if cfg.Priority > 0 {
+		f.log = log.WithPrefix(fmt.Sprintf("format | %s[%d]", name, cfg.Priority))
 	} else {
-		f.log = log.WithPrefix(fmt.Sprintf("format | %s[%s]", cfg.Pipeline, name))
+		f.log = log.WithPrefix(fmt.Sprintf("format | %s", name))
 	}
 
 	f.includes, err = CompileGlobs(cfg.Includes)
diff --git a/format/pipeline.go b/format/pipeline.go
deleted file mode 100644
index af8b1c3..0000000
--- a/format/pipeline.go
+++ /dev/null
@@ -1,40 +0,0 @@
-package format
-
-import (
-	"context"
-	"slices"
-
-	"git.numtide.com/numtide/treefmt/walk"
-)
-
-type Pipeline struct {
-	sequence []*Formatter
-}
-
-func (p *Pipeline) Add(f *Formatter) {
-	p.sequence = append(p.sequence, f)
-	// sort by priority in ascending order
-	slices.SortFunc(p.sequence, func(a, b *Formatter) int {
-		return a.config.Priority - b.config.Priority
-	})
-}
-
-func (p *Pipeline) Wants(path *walk.File) bool {
-	var match bool
-	for _, f := range p.sequence {
-		match = f.Wants(path)
-		if match {
-			break
-		}
-	}
-	return match
-}
-
-func (p *Pipeline) Apply(ctx context.Context, paths []*walk.File) error {
-	for _, f := range p.sequence {
-		if err := f.Apply(ctx, paths, len(p.sequence) > 1); err != nil {
-			return err
-		}
-	}
-	return nil
-}
diff --git a/format/task.go b/format/task.go
new file mode 100644
index 0000000..062b454
--- /dev/null
+++ b/format/task.go
@@ -0,0 +1,42 @@
+package format
+
+import (
+	"cmp"
+	"slices"
+
+	"git.numtide.com/numtide/treefmt/walk"
+)
+
+type Task struct {
+	File       *walk.File
+	Formatters []*Formatter
+	BatchKey   string
+}
+
+func NewTask(file *walk.File, formatters []*Formatter) Task {
+	// sort by priority in ascending order
+	slices.SortFunc(formatters, func(a, b *Formatter) int {
+		priorityA := a.Priority()
+		priorityB := b.Priority()
+
+		result := priorityA - priorityB
+		if result == 0 {
+			// formatters with the same priority are sorted lexicographically to ensure a deterministic outcome
+			result = cmp.Compare(a.Name(), b.Name())
+		}
+		return result
+	})
+
+	// construct a batch key which represents the unique sequence of formatters to be applied to file
+	var key string
+	for _, f := range formatters {
+		key += f.name + ":"
+	}
+	key = key[:len(key)-1]
+
+	return Task{
+		File:       file,
+		Formatters: formatters,
+		BatchKey:   key,
+	}
+}
diff --git a/nix/treefmt.nix b/nix/treefmt.nix
index 4995e6c..cc50122 100644
--- a/nix/treefmt.nix
+++ b/nix/treefmt.nix
@@ -24,17 +24,14 @@
 
       settings.formatter = {
         deadnix = {
-          pipeline = "nix";
           priority = 1;
         };
 
         statix = {
-          pipeline = "nix";
           priority = 2;
         };
 
         alejandra = {
-          pipeline = "nix";
           priority = 3;
         };
 
diff --git a/test/examples/nixpkgs.toml b/test/examples/nixpkgs.toml
index 41401cd..e862f61 100644
--- a/test/examples/nixpkgs.toml
+++ b/test/examples/nixpkgs.toml
@@ -4,11 +4,9 @@
 command = "deadnix"
 options = ["--edit"]
 includes = ["*.nix"]
-pipeline = "nix"
 priority = 1
 
 [formatter.nixpkgs-fmt]
 command = "nixpkgs-fmt"
 includes = ["*.nix"]
-pipeline = "nix"
 priority = 2
\ No newline at end of file
diff --git a/test/examples/treefmt.toml b/test/examples/treefmt.toml
index 8d3ea62..8acc9eb 100644
--- a/test/examples/treefmt.toml
+++ b/test/examples/treefmt.toml
@@ -31,13 +31,11 @@ command = "alejandra"
 includes = ["*.nix"]
 # Act as an example on how to exclude specific files
 excludes = ["examples/nix/sources.nix"]
-pipeline = "nix"
 priority = 1
 
 [formatter.deadnix]
 command = "deadnix"
 includes = ["*.nix"]
-pipeline = "nix"
 priority = 2
 
 [formatter.ruby]