diff --git a/bin/coqui/.gitkeep b/bin/coqui/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/package.json b/package.json index 3a66122d..3fc2cfd7 100644 --- a/package.json +++ b/package.json @@ -78,6 +78,7 @@ "node-wav": "0.0.2", "socket.io": "^4.4.0", "socket.io-client": "^4.4.0", + "stt": "^1.1.0", "superagent": "^6.1.0" }, "devDependencies": { diff --git a/scripts/setup-offline/setup-stt.js b/scripts/setup-offline/setup-stt.js index 21896fb1..e35bcf40 100644 --- a/scripts/setup-offline/setup-stt.js +++ b/scripts/setup-offline/setup-stt.js @@ -10,23 +10,23 @@ import os from '@/helpers/os' export default () => new Promise(async (resolve, reject) => { log.info('Setting up offline speech-to-text...') - const destDeepSpeechFolder = 'bin/deepspeech' + const destCoquiFolder = 'bin/coqui' const tmpDir = 'scripts/tmp' - const deepSpeechVersion = '0.9.3' + const coquiVersion = '1.0.0' let downloader = 'wget' if (os.get().type === 'macos') { downloader = 'curl -L -O' } - if (!fs.existsSync(`${destDeepSpeechFolder}/deepspeech.scorer`)) { + if (!fs.existsSync(`${destCoquiFolder}/model.tflite`)) { try { log.info('Downloading pre-trained model...') - await command(`cd ${tmpDir} && ${downloader} https://github.com/mozilla/DeepSpeech/releases/download/v${deepSpeechVersion}/deepspeech-${deepSpeechVersion}-models.pbmm`, { shell: true }) - await command(`cd ${tmpDir} && ${downloader} https://github.com/mozilla/DeepSpeech/releases/download/v${deepSpeechVersion}/deepspeech-${deepSpeechVersion}-models.scorer`, { shell: true }) + await command(`cd ${tmpDir} && ${downloader} https://github.com/coqui-ai/STT-models/releases/download/english/coqui/v${coquiVersion}-huge-vocab/model.tflite`, { shell: true }) + await command(`cd ${tmpDir} && ${downloader} https://github.com/coqui-ai/STT-models/releases/download/english/coqui/v${coquiVersion}-huge-vocab/huge-vocabulary.scorer`, { shell: true }) log.success('Pre-trained model download done') log.info('Moving...') - await command(`mv -f ${tmpDir}/deepspeech-${deepSpeechVersion}-models.pbmm ${destDeepSpeechFolder}/deepspeech.pbmm`, { shell: true }) - await command(`mv -f ${tmpDir}/deepspeech-${deepSpeechVersion}-models.scorer ${destDeepSpeechFolder}/deepspeech.scorer`, { shell: true }) + await command(`mv -f ${tmpDir}/model.tflite ${destCoquiFolder}/model.tflite`, { shell: true }) + await command(`mv -f ${tmpDir}/huge-vocabulary.scorer ${destCoquiFolder}/huge-vocabulary.scorer`, { shell: true }) log.success('Move done') log.success('Offline speech-to-text installed') diff --git a/server/src/stt/coqui-stt/parser.js b/server/src/stt/coqui-stt/parser.js new file mode 100644 index 00000000..e83cf0c7 --- /dev/null +++ b/server/src/stt/coqui-stt/parser.js @@ -0,0 +1,98 @@ +import wav from 'node-wav' +import fs from 'fs' + +import log from '@/helpers/log' + +log.title('Coqui-ai Parser') + +const parser = { } +let STT = { } + +/* istanbul ignore next */ +try { + STT = require('stt-gpu') // eslint-disable-line global-require, import/no-unresolved + + log.success('GPU version found') +} catch (eGpu) { + log.info('GPU version not found, trying to get the CPU version...') + + try { + STT = require('stt') // eslint-disable-line global-require, import/no-unresolved + + log.success('CPU version found') + } catch (eCpu) { + log.error(`No Coqui-ai library found:\nGPU: ${eGpu}\nCPU: ${eCpu}`) + } +} + +let model = { } +let desiredSampleRate = 16000 + +/** + * Model and language model paths + */ +parser.conf = { + model: 'bin/coqui/model.tflite', + scorer: 'bin/coqui/huge-vocabulary.scorer' +} + +/** + * Load models + */ +parser.init = (args) => { + log.info(`Loading model from file ${args.model}...`) + + if (!fs.existsSync(args.model)) { + log.error(`Cannot find ${args.model}. You can setup the offline STT by running: "npm run setup:offline-stt"`) + + return false + } + + if (!fs.existsSync(args.scorer)) { + log.error(`Cannot find ${args.scorer}. You can setup the offline STT by running: "npm run setup:offline-stt"`) + + return false + } + + /* istanbul ignore if */ + if (process.env.LEON_NODE_ENV !== 'testing') { + try { + model = new STT.Model(args.model) + } catch (error) { + throw Error(`model.stt: ${error}`) + } + desiredSampleRate = model.sampleRate() + + try { + model.enableExternalScorer(args.scorer) + } catch (error) { + throw Error(`model.enableExternalScorer: ${error}`) + } + } + + log.success('Model loaded') + + return true +} + +/** + * Parse file and infer + */ +parser.parse = (buffer, cb) => { + const wavDecode = wav.decode(buffer) + + if (wavDecode.sampleRate < desiredSampleRate) { + log.warning(`Original sample rate (${wavDecode.sampleRate}) is lower than ${desiredSampleRate}Hz. Up-sampling might produce erratic speech recognition`) + } + + /* istanbul ignore if */ + if (process.env.LEON_NODE_ENV !== 'testing') { + const string = model.stt(buffer) + + cb({ string }) + } + + return true +} + +export default parser diff --git a/server/src/stt/stt.js b/server/src/stt/stt.js index 0df60954..9ef7adbe 100644 --- a/server/src/stt/stt.js +++ b/server/src/stt/stt.js @@ -10,7 +10,8 @@ class Stt { this.providers = [ 'deepspeech', 'google-cloud-stt', - 'watson-stt' + 'watson-stt', + 'coqui-stt' ] this.parser = { }