feat(server): make Coqui STT the default STT solution

2024-11-27 16:16:48 +03:00 · 2022-01-31 14:43:54 +08:00 · 2022-01-31 14:43:54 +08:00 · 7039918760
commit 7039918760
parent 79be099cc5
9 changed files with 10 additions and 177 deletions
--- a/.env.sample
+++ b/.env.sample
@@ -17,7 +17,7 @@ LEON_AFTER_SPEECH=false
 # Enable/disable Leon's speech-to-text
 LEON_STT=false
 # Speech-to-text provider
-LEON_STT_PROVIDER=deepspeech
+LEON_STT_PROVIDER=coqui-stt

 # Enable/disable Leon's text-to-speech
 LEON_TTS=false
--- a/package-lock.json
+++ b/package-lock.json
@@ -23,7 +23,6 @@
        "archiver": "^5.3.0",
        "async": "^3.2.0",
        "cross-env": "^7.0.3",
-        "deepspeech": "^0.9.3",
        "dotenv": "^10.0.0",
        "execa": "^5.0.0",
        "fastify": "^3.25.2",
@@ -7989,21 +7988,6 @@
        "node": ">=0.10.0"
      }
    },
-    "node_modules/deepspeech": {
-      "version": "0.9.3",
-      "resolved": "https://registry.npmjs.org/deepspeech/-/deepspeech-0.9.3.tgz",
-      "integrity": "sha512-80yqUWEWgzclY9pjukpq0TdkQEbJ6NzqZ899vsLZfa4YcK35uOWIf+ILK55zQ+Ii/TEJ6Eo62Vc2saedQ/AK6w==",
-      "dependencies": {
-        "argparse": "1.0.x",
-        "memory-stream": "1.0.x",
-        "node-pre-gyp": "0.15.x",
-        "node-wav": "0.0.2",
-        "sox-stream": "2.0.x"
-      },
-      "bin": {
-        "deepspeech": "client.js"
-      }
-    },
    "node_modules/defaults": {
      "version": "1.0.3",
      "resolved": "https://registry.npmjs.org/defaults/-/defaults-1.0.3.tgz",
@@ -26790,18 +26774,6 @@
      "resolved": "https://registry.npmjs.org/deepmerge/-/deepmerge-4.2.2.tgz",
      "integrity": "sha512-FJ3UgI4gIl+PHZm53knsuSFpE+nESMr7M4v9QcgB7S63Kj/6WqMiFQJpBBYz1Pt+66bZpP3Q7Lye0Oo9MPKEdg=="
    },
-    "deepspeech": {
-      "version": "0.9.3",
-      "resolved": "https://registry.npmjs.org/deepspeech/-/deepspeech-0.9.3.tgz",
-      "integrity": "sha512-80yqUWEWgzclY9pjukpq0TdkQEbJ6NzqZ899vsLZfa4YcK35uOWIf+ILK55zQ+Ii/TEJ6Eo62Vc2saedQ/AK6w==",
-      "requires": {
-        "argparse": "1.0.x",
-        "memory-stream": "1.0.x",
-        "node-pre-gyp": "0.15.x",
-        "node-wav": "0.0.2",
-        "sox-stream": "2.0.x"
-      }
-    },
    "defaults": {
      "version": "1.0.3",
      "resolved": "https://registry.npmjs.org/defaults/-/defaults-1.0.3.tgz",
--- a/package.json
+++ b/package.json
@@ -66,7 +66,6 @@
    "archiver": "^5.3.0",
    "async": "^3.2.0",
    "cross-env": "^7.0.3",
-    "deepspeech": "^0.9.3",
    "dotenv": "^10.0.0",
    "execa": "^5.0.0",
    "fastify": "^3.25.2",
--- a/scripts/setup/preinstall.js
+++ b/scripts/setup/preinstall.js
@@ -1,32 +1 @@
-const fs = require('fs')
-const path = require('path')
-const os = require('os')
-
-/**
- * Trigger preinstall hook to remove DeepSpeech on Windows
- */
-
 console.info('\x1b[36m➡ %s\x1b[0m', 'Running Leon\'s installation...')
-
-if (os.type().indexOf('Windows') !== -1) {
-  const packageJsonPath = path.join(__dirname, '../../package.json')
-  const packageJson = require(packageJsonPath) // eslint-disable-line global-require
-
-  console.warn('\x1b[33m❗ %s\x1b[0m', 'The Leon\'s voice offline mode is not available on Windows')
-  console.info('\x1b[36m➡ %s\x1b[0m', 'Backing up package.json...')
-  fs.copyFileSync('package.json', 'package.json.backup')
-  console.log('\x1b[32m✔ %s\x1b[0m', 'package.json has been backed up')
-
-  try {
-    if (packageJson?.dependencies.deepspeech) {
-      console.info('\x1b[36m➡ %s\x1b[0m', 'Removing DeepSpeech dependency...')
-
-      delete packageJson.dependencies.deepspeech
-      fs.writeFileSync(packageJsonPath, JSON.stringify(packageJson, null, 2))
-
-      console.log('\x1b[32m✔ %s\x1b[0m', 'DeepSpeech dependency has been removed.')
-    }
-  } catch (e) {
-    console.error('\x1b[31m✖ %s\x1b[0m', 'Failed to remove DeepSpeech dependency')
-  }
-}
--- a/scripts/setup/setup.js
+++ b/scripts/setup/setup.js
@@ -1,8 +1,5 @@
-import fs from 'fs'
-
 import loader from '@/helpers/loader'
 import log from '@/helpers/log'
-import os from '@/helpers/os'

 import train from '../train'
 import setupDotenv from './setup-dotenv'
@@ -17,8 +14,6 @@ import setupPythonPackages from './setup-python-packages'
 */
 (async () => {
  try {
-    const info = os.get()
-
    // Required env vars to setup
    process.env.LEON_LANG = 'en-US'
    process.env.PIPENV_PIPFILE = 'bridges/python/Pipfile'
@@ -32,12 +27,6 @@ import setupPythonPackages from './setup-python-packages'
    ])
    await setupPythonPackages()
    await train()
-    if (info.type === 'windows') {
-      log.info('Windows detected, reinjecting DeepSpeech into package.json...')
-      fs.unlinkSync('package.json')
-      fs.renameSync('package.json.backup', 'package.json')
-      log.success('DeepSpeech has been reinjected into package.json')
-    }

    log.default('')
    log.success('Hooray! Leon is installed and ready to go!')
--- a/server/src/stt/deepspeech/parser.js
+++ b/server/src/stt/deepspeech/parser.js
@@ -1,95 +0,0 @@
-import wav from 'node-wav'
-import fs from 'fs'
-
-import log from '@/helpers/log'
-
-log.title('DeepSpeech Parser')
-
-const parser = { }
-let DeepSpeech = { }
-
-/* istanbul ignore next */
-try {
-  DeepSpeech = require('deepspeech-gpu') // eslint-disable-line global-require, import/no-unresolved
-
-  log.success('GPU version found')
-} catch (eGpu) {
-  log.info('GPU version not found, trying to get the CPU version...')
-
-  try {
-    DeepSpeech = require('deepspeech') // eslint-disable-line global-require, import/no-unresolved
-
-    log.success('CPU version found')
-  } catch (eCpu) {
-    log.error(`No DeepSpeech library found:\nGPU: ${eGpu}\nCPU: ${eCpu}`)
-  }
-}
-
-let model = { }
-let desiredSampleRate = 16000
-
-/**
- * Model and language model paths
- */
-parser.conf = {
-  model: 'bin/deepspeech/deepspeech.pbmm',
-  scorer: 'bin/deepspeech/deepspeech.scorer'
-}
-
-/**
- * Load models
- */
-parser.init = (args) => {
-  /* istanbul ignore if */
-  if (process.env.LEON_LANG !== 'en-US') {
-    log.warning('The DeepSpeech parser only accepts the "en-US" language for the moment')
-  }
-
-  log.info(`Loading model from file ${args.model}...`)
-
-  if (!fs.existsSync(args.model)) {
-    log.error(`Cannot find ${args.model}. You can setup the offline STT by running: "npm run setup:offline-stt"`)
-
-    return false
-  }
-
-  if (!fs.existsSync(args.scorer)) {
-    log.error(`Cannot find ${args.scorer}. You can setup the offline STT by running: "npm run setup:offline-stt"`)
-
-    return false
-  }
-
-  /* istanbul ignore if */
-  if (process.env.LEON_NODE_ENV !== 'testing') {
-    model = new DeepSpeech.Model(args.model)
-    desiredSampleRate = model.sampleRate()
-
-    model.enableExternalScorer(args.scorer)
-  }
-
-  log.success('Model loaded')
-
-  return true
-}
-
-/**
- * Parse file and infer
- */
-parser.parse = (buffer, cb) => {
-  const wavDecode = wav.decode(buffer)
-
-  if (wavDecode.sampleRate < desiredSampleRate) {
-    log.warning(`Original sample rate (${wavDecode.sampleRate}) is lower than ${desiredSampleRate}Hz. Up-sampling might produce erratic speech recognition`)
-  }
-
-  /* istanbul ignore if */
-  if (process.env.LEON_NODE_ENV !== 'testing') {
-    const string = model.stt(buffer)
-
-    cb({ string })
-  }
-
-  return true
-}
-
-export default parser
--- a/server/src/stt/stt.js
+++ b/server/src/stt/stt.js
@@ -8,7 +8,6 @@ class Stt {
    this.socket = socket
    this.provider = provider
    this.providers = [
-      'deepspeech',
      'google-cloud-stt',
      'watson-stt',
      'coqui-stt'
--- a/test/unit/server/stt/deepspeech/parser.spec.js
+++ b/test/unit/server/stt/deepspeech/parser.spec.js
@@ -1,11 +1,11 @@
 import fs from 'fs'

-import parser from '@/stt/deepspeech/parser'
+import parser from '@/stt/coqui-stt/parser'

-describe('DeepSpeech STT parser', () => {
+describe('Coqui STT parser', () => {
  // Only run these tests if the models exist
-  if (fs.existsSync(`${global.paths.root}/bin/deepspeech/deepspeech.pbmm`)
-    && fs.existsSync(`${global.paths.root}/bin/deepspeech/deepspeech.scorer`)) {
+  if (fs.existsSync(`${global.paths.root}/bin/coqui/model.tflite`)
+    && fs.existsSync(`${global.paths.root}/bin/coqui/huge-vocabulary.scorer`)) {
    describe('init()', () => {
      test('returns error cannot find model', () => {
        expect(parser.init({
@@ -15,15 +15,15 @@ describe('DeepSpeech STT parser', () => {

      test('returns error cannot find scorer', () => {
        expect(parser.init({
-          model: `${global.paths.root}/bin/deepspeech/deepspeech.pbmm`,
+          model: `${global.paths.root}/bin/coqui/model.tflite`,
          scorer: 'fake-scorer-path'
        })).toBeFalsy()
      })

      test('returns true because all of the paths are good', () => {
        expect(parser.init({
-          model: `${global.paths.root}/bin/deepspeech/deepspeech.pbmm`,
-          scorer: `${global.paths.root}/bin/deepspeech/deepspeech.scorer`
+          model: `${global.paths.root}/bin/coqui/model.tflite`,
+          scorer: `${global.paths.root}/bin/coqui/huge-vocabulary.scorer`
        })).toBeTruthy()
      })
    })
--- a/test/unit/server/stt/stt.spec.js
+++ b/test/unit/server/stt/stt.spec.js
@@ -3,7 +3,7 @@ import Stt from '@/stt/stt'
 describe('STT', () => {
  describe('constructor()', () => {
    test('creates a new instance of Stt', () => {
-      const stt = new Stt({ }, 'deepspeech')
+      const stt = new Stt({ }, 'coqui-stt')

      expect(stt).toBeInstanceOf(Stt)
    })
@@ -17,7 +17,7 @@ describe('STT', () => {
    })

    test('initializes the STT parser', () => {
-      const stt = new Stt({ }, 'deepspeech')
+      const stt = new Stt({ }, 'coqui-stt')

      expect(stt.init()).toBeTruthy()
    })