autopopulate spellchecker (#1749)

Builds upon #1587. Extract all symbol names that are not native to the current project and insert them into our own custom spell checking dictionary's "words" list. The premise is that symbols that *are* native to our project should be spellchecked, but foreign symbols that constitute unrecognized dictionary words are presumably intentionally spelled that way. # Convention Manually-added words (i.e. for names in code that we've written for this project) will go into `.vscode/settings.json`. The automatically generated word list from third-party packages goes into `cspell.json`. # Usage scripts/spellcheck/autopopulate-spellchecker.sh
2024-07-07 11:16:35 +03:00 · 2024-01-27 17:54:13 -08:00 · 2024-01-27 17:54:13 -08:00 · 9f5c165fd8
commit 9f5c165fd8
parent 42d4e54797
5 changed files with 1531 additions and 36 deletions
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@ -16,92 +16,63 @@
        "antiquotation",
        "antiquoted",
        "antiquoting",
-        "assocs",
        "autoplay",
-        "Bifunctor",
-        "bimap",
        "Blackhole",
        "Blackholes",
        "bquote",
        "Bytestring",
        "callsite",
-        "cata",
-        "CESK",
        "Cmds",
-        "Colour",
-        "Combinators",
        "conv",
        "Corasick",
        "datatypes",
        "determinator",
        "determinators",
-        "distrib",
        "eithers",
-        "elems",
        "ents",
        "floorplan",
        "Focusable",
-        "foldl",
-        "foldr",
        "Gameplay",
-        "Hashable",
        "hifi",
        "HLINT",
        "homomorphic",
-        "hsep",
-        "Ixed",
        "JSONE",
        "Keymap",
        "kolor",
        "leaderboard",
-        "mappend",
-        "mconcat",
-        "mempty",
-        "Monoid",
        "multiset",
        "multiworld",
+        "notif",
        "objs",
        "omni",
        "Parameterizable",
        "parens",
        "pathfinding",
-        "Perlin",
-        "perp",
        "playfield",
        "Polytype",
        "polytypes",
        "pparens",
        "prec",
        "precomputation",
-        "Precompute",
        "preprocess",
        "prereq",
        "prereqs",
-        "Prettyprinter",
-        "preuse",
        "previewable",
        "pushable",
        "quasiquoter",
        "quasiquoters",
-        "sconcat",
+        "reqs",
        "scrollability",
        "selfdestruct",
-        "Semigroup",
-        "snoc",
-        "Splittable",
        "squote",
-        "SRGB",
        "struct",
        "Structs",
        "Subdir",
-        "Substate",
        "subrecord",
        "subterms",
        "subtyping",
        "Subworld",
        "subworlds",
-        "succ",
-        "reqs",
        "teleporting",
        "Tiebreaking",
        "toplevel",
@ -112,13 +83,9 @@
        "typeclass",
        "ucata",
        "Unchainable",
-        "uncurry",
        "unequip",
-        "unfoldr",
-        "uniplate",
        "unranked",
        "Unwalkable",
-        "unwords",
        "upperleft",
        "vals",
        "verbed",
--- a/cspell.json
+++ b/cspell.json
--- a/scripts/reformat-code.sh
+++ b/scripts/reformat-code.sh
@ -3,4 +3,4 @@
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
 cd $SCRIPT_DIR/..

-fourmolu --mode=inplace src app test
+fourmolu --mode=inplace src app test scripts
--- a/scripts/spellcheck/autopopulate-spellchecker.sh
+++ b/scripts/spellcheck/autopopulate-spellchecker.sh
@ -0,0 +1,45 @@
+#!/bin/bash -e
+# Regenerate the "words" list in cspell.json from names that originate in external packages.
+
+# This script lives 2 levels deep in the directory structure.
+# Ensure its commands get run at the toplevel.
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+cd "$SCRIPT_DIR/../.."
+
+# First, install hiedb:
+#
+#    cabal install hiedb
+#
+# Then, generate *.hie files:
+#
+#    cp hie.yaml.stack hie.yaml
+#    stack build --fast
+
+DBNAME=hie.sqlite
+hiedb --database "$DBNAME" index .hie
+
+# Keep all scratch files in one private directory that is removed on any exit,
+# including when "-e" aborts the script partway through.
+WORK_DIR=$(mktemp -d)
+trap 'rm -rf -- "$WORK_DIR"' EXIT
+WORDS_TMPFILE_UNSORTED=$WORK_DIR/unsorted.words
+WORDS_TMPFILE=$WORK_DIR/sorted.words
+CSPELL_TMPFILE=$WORK_DIR/words.cspell
+CSPELL_FILEPATH=cspell.json
+
+# First, get all variable names that originate from external packages.
+# Exclude names that contain symbols other than underscore or apostrophe or do not contain any letters.
+sqlite3 "$DBNAME" "SELECT substr(sym, 3) FROM (SELECT DISTINCT occ AS sym FROM refs WHERE unit NOT IN (SELECT DISTINCT unit FROM mods)) ORDER BY sym" | grep -v "[^[:alnum:]_']" | grep "[[:alpha:]]" > "$WORDS_TMPFILE_UNSORTED"
+
+# Next, append the individual "conids" extracted from all "modids" that originate from external packages.
+# See definition of "conid" and "modid" here:
+# https://www.haskell.org/onlinereport/haskell2010/haskellch5.html
+sqlite3 "$DBNAME" "SELECT DISTINCT mod FROM refs WHERE unit NOT IN (SELECT DISTINCT unit FROM mods)" | scripts/spellcheck/split-module-names.hs >> "$WORDS_TMPFILE_UNSORTED"
+
+sort -u "$WORDS_TMPFILE_UNSORTED" > "$WORDS_TMPFILE"
+
+# Now, stuff the sorted list of names into the cspell JSON file word list.
+jq '.words = $newWords' --slurpfile newWords <(jq --raw-input --null-input 'inputs' "$WORDS_TMPFILE") "$CSPELL_FILEPATH" > "$CSPELL_TMPFILE"
+
+# Overwrite the original with the modified version.
+mv "$CSPELL_TMPFILE" "$CSPELL_FILEPATH"
--- a/scripts/spellcheck/split-module-names.hs
+++ b/scripts/spellcheck/split-module-names.hs
@ -0,0 +1,28 @@
+#!/usr/bin/env stack
+{- stack script --resolver lts-21.25
+  --package data-ordlist
+  --package split
+-}
+
+import Data.List.Ordered (nubSort)
+import Data.List.Split (splitOn)
+
+-- | Extracts all "conids" from a list of "modids".
+-- (see definitions here: https://www.haskell.org/onlinereport/haskell2010/haskellch5.html )
+--
+-- E.g., takes lines of the following form:
+--
+--    Data.Text
+--    Control.Arrow
+--
+-- and produces a flattened, sorted, deduplicated list of non-empty words:
+--
+--   Arrow
+--   Control
+--   Data
+--   Text
+splitParts :: String -> String
+splitParts = unlines . nubSort . filter (not . null) . concatMap (splitOn ".") . lines
+
+main :: IO ()
+main = getContents >>= putStr . splitParts -- filter: module names on stdin, words on stdout