autopopulate spellchecker (#1749)

Builds upon #1587.

Extract all symbol names that are not native to the current project and insert them into our own custom spell checking dictionary's "words" list.

The premise is that symbols that *are* native to our project should be spellchecked, but foreign symbols that constitute unrecognized dictionary words are presumably intentionally spelled that way.

# Convention

Manually-added words (i.e. for names in code that we've written for this project) will go into `.vscode/settings.json`.
The automatically generated word list from third-party packages goes into `cspell.json`.

# Usage

    scripts/spellcheck/autopopulate-spellchecker.sh
This commit is contained in:
Karl Ostmo 2024-01-27 17:54:13 -08:00 committed by GitHub
parent 42d4e54797
commit 9f5c165fd8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 1531 additions and 36 deletions

37
.vscode/settings.json vendored
View File

@ -16,92 +16,63 @@
"antiquotation",
"antiquoted",
"antiquoting",
"assocs",
"autoplay",
"Bifunctor",
"bimap",
"Blackhole",
"Blackholes",
"bquote",
"Bytestring",
"callsite",
"cata",
"CESK",
"Cmds",
"Colour",
"Combinators",
"conv",
"Corasick",
"datatypes",
"determinator",
"determinators",
"distrib",
"eithers",
"elems",
"ents",
"floorplan",
"Focusable",
"foldl",
"foldr",
"Gameplay",
"Hashable",
"hifi",
"HLINT",
"homomorphic",
"hsep",
"Ixed",
"JSONE",
"Keymap",
"kolor",
"leaderboard",
"mappend",
"mconcat",
"mempty",
"Monoid",
"multiset",
"multiworld",
"notif",
"objs",
"omni",
"Parameterizable",
"parens",
"pathfinding",
"Perlin",
"perp",
"playfield",
"Polytype",
"polytypes",
"pparens",
"prec",
"precomputation",
"Precompute",
"preprocess",
"prereq",
"prereqs",
"Prettyprinter",
"preuse",
"previewable",
"pushable",
"quasiquoter",
"quasiquoters",
"sconcat",
"reqs",
"scrollability",
"selfdestruct",
"Semigroup",
"snoc",
"Splittable",
"squote",
"SRGB",
"struct",
"Structs",
"Subdir",
"Substate",
"subrecord",
"subterms",
"subtyping",
"Subworld",
"subworlds",
"succ",
"reqs",
"teleporting",
"Tiebreaking",
"toplevel",
@ -112,13 +83,9 @@
"typeclass",
"ucata",
"Unchainable",
"uncurry",
"unequip",
"unfoldr",
"uniplate",
"unranked",
"Unwalkable",
"unwords",
"upperleft",
"vals",
"verbed",

1455
cspell.json Normal file

File diff suppressed because it is too large Load Diff

View File

@ -3,4 +3,4 @@
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd $SCRIPT_DIR/..
fourmolu --mode=inplace src app test
fourmolu --mode=inplace src app test scripts

View File

@ -0,0 +1,45 @@
#!/bin/bash -e
# Regenerates the "words" list of cspell.json from the names of symbols and
# modules that originate from packages other than this project.
#
# Requires on PATH: hiedb, sqlite3, jq (and stack, for split-module-names.hs).

# Keep "exit on error" even when invoked as "bash <script>" (which ignores the
# shebang flags), and fail when ANY stage of a pipeline fails, not just the last.
set -eo pipefail

# This script lives 2 levels deep in the directory structure.
# Ensure its commands get run at the toplevel.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
# Quoted so that a checkout path containing whitespace still works.
cd "$SCRIPT_DIR/../.."

# First, install hiedb:
#
# cabal install hiedb
#
# Then, generate *.hie files:
#
# cp hie.yaml.stack hie.yaml
# stack build --fast

DBNAME=hie.sqlite
CSPELL_FILEPATH=cspell.json

# Every temporary file is registered here and removed on any exit path,
# including early termination caused by a failing command.
TMPFILES=()
trap 'rm -f -- "${TMPFILES[@]}"' EXIT

hiedb --database "$DBNAME" index .hie

WORDS_TMPFILE_UNSORTED=$(mktemp --suffix .words)
TMPFILES+=("$WORDS_TMPFILE_UNSORTED")

# First, get all variable names that originate from external packages.
# Exclude names that contain symbols other than underscore or apostrophe or do not contain any letters.
sqlite3 "$DBNAME" "SELECT substr(sym, 3) FROM (SELECT DISTINCT occ AS sym FROM refs WHERE unit NOT IN (SELECT DISTINCT unit FROM mods)) ORDER BY sym" | grep -v "[^[:alnum:]_']" | grep "[[:alpha:]]" > "$WORDS_TMPFILE_UNSORTED"

# Next, append the individual "conids" extracted from all "modids" that originate from external packages.
# See definition of "conid" and "modid" here:
# https://www.haskell.org/onlinereport/haskell2010/haskellch5.html
sqlite3 "$DBNAME" "SELECT DISTINCT mod FROM refs WHERE unit NOT IN (SELECT DISTINCT unit FROM mods)" | scripts/spellcheck/split-module-names.hs >> "$WORDS_TMPFILE_UNSORTED"

WORDS_TMPFILE=$(mktemp --suffix .words)
TMPFILES+=("$WORDS_TMPFILE")
sort -u "$WORDS_TMPFILE_UNSORTED" > "$WORDS_TMPFILE"

CSPELL_TMPFILE=$(mktemp --suffix .cspell)
TMPFILES+=("$CSPELL_TMPFILE")

# Now, stuff the sorted list of names into the cspell JSON file word list.
# The inner jq turns each input line into a JSON string; --slurpfile collects
# those strings into the array bound to $newWords.
jq '.words = $newWords' --slurpfile newWords <(jq --raw-input --null-input 'inputs' "$WORDS_TMPFILE") "$CSPELL_FILEPATH" > "$CSPELL_TMPFILE"

# Overwrite the original with the modified version.
# (Written to a temp file first because jq cannot edit its input in place.)
mv "$CSPELL_TMPFILE" "$CSPELL_FILEPATH"

View File

@ -0,0 +1,28 @@
#!/usr/bin/env stack
{- stack script --resolver lts-21.25
--package data-ordlist
--package split
-}
import Data.List.Ordered (nubSort)
import Data.List.Split (splitOn)
-- |
-- Extracts all "conids" from a list of "modids".
-- (see definitions here: https://www.haskell.org/onlinereport/haskell2010/haskellch5.html )
--
-- E.g., takes lines of the following form:
--
-- Data.Text
-- Control.Arrow
--
-- and produces a flattened list of words:
--
-- Arrow
-- Control
-- Data
-- Text
--
-- The input is one module name per line; the output is one word per line,
-- sorted and without duplicates. Empty components (from blank input lines or
-- stray dots) are dropped so that no empty "word" reaches the dictionary.
splitParts :: String -> String
splitParts = unlines . nubSort . filter (not . null) . concatMap (splitOn ".") . lines
-- | Feeds all of stdin through 'splitParts' and prints the result to stdout.
main :: IO ()
main = getContents >>= putStr . splitParts