mirror of https://github.com/leon-ai/leon.git

feat(python tcp server): embed new text-to-speech engine in TCP server binary

louistiti 2024-05-18 01:11:12 +08:00
parent e455a9d96b
commit a7cab344f8
No known key found for this signature in database
GPG Key ID: 92CD6A2E497E1669
15 changed files with 77 additions and 78 deletions

View File

@@ -54,7 +54,7 @@
"build:nodejs-bridge": "tsx scripts/build-binaries.js nodejs-bridge",
"build:python-bridge": "tsx scripts/build-binaries.js python-bridge",
"build:tcp-server": "tsx scripts/build-binaries.js tcp-server",
"start:tcp-server": "cross-env PIPENV_PIPFILE=tcp_server/src/Pipfile pipenv run python tcp_server/src/main.py",
"start:tcp-server": "cross-env HF_HUB_VERBOSITY=debug PIPENV_PIPFILE=tcp_server/src/Pipfile pipenv run python tcp_server/src/main.py",
"start": "cross-env LEON_NODE_ENV=production node server/dist/pre-check.js && node server/dist/index.js",
"python-bridge": "cross-env PIPENV_PIPFILE=bridges/python/src/Pipfile pipenv run python bridges/python/src/main.py server/src/intent-object.sample.json",
"train": "tsx scripts/train/run-train.js",

View File

@@ -19,7 +19,6 @@ import {
} from '@/constants'
import { OSTypes } from '@/types'
import { LogHelper } from '@/helpers/log-helper'
- import { LoaderHelper } from '@/helpers/loader-helper'
import { SystemHelper } from '@/helpers/system-helper'
/**
@@ -57,8 +56,6 @@ BUILD_TARGETS.set('tcp-server', {
dotVenvPath: path.join(PYTHON_TCP_SERVER_SRC_PATH, '.venv')
})
;(async () => {
- LoaderHelper.start()
const { argv } = process
const givenBuildTarget = argv[2].toLowerCase()
@@ -117,8 +114,12 @@
process.env.PIPENV_PIPFILE = pipfilePath
process.env.PIPENV_VENV_IN_PROJECT = true
+ /**
+  * cx_Freeze usage
+  * @see https://cx-freeze.readthedocs.io/en/latest/setup_script.html#build-exe
+  */
await command(
-   `pipenv run python ${setupFilePath} build --build-exe ${buildPath}`,
+   `pipenv run python ${setupFilePath} build_exe --build-exe ${buildPath}`,
{
shell: true,
stdio: 'inherit'

View File

@@ -131,26 +131,37 @@ SPACY_MODELS.set('fr', {
const hasDotVenv = fs.existsSync(dotVenvPath)
const { type: osType, cpuArchitecture } = SystemHelper.getInformation()
/**
- * Install PyTorch nightly to support CUDA 12.4
- * as it is required by the latest NVIDIA drivers for CUDA runtime APIs
+ * Install PyTorch with CUDA support
+ * as it is required by the latest NVIDIA drivers for CUDA runtime APIs.
+ * PyTorch will automatically download nvidia-* packages and bundle them.
+ *
+ * It is important to specify the "--ignore-installed" flag to make sure the
+ * "~/.pyenv/versions/3.9.10/lib/python3.9/site-packages" path is not used in case
+ * NVIDIA deps are already installed. Otherwise, pip won't install them in our
+ * TCP server .venv, as they are already installed (satisfied) in
+ * the path mentioned above.
+ *
+ * @see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix
* @see https://pytorch.org/get-started/locally/
* @see https://stackoverflow.com/a/76972265/1768162
+ * @see https://docs.nvidia.com/deeplearning/cudnn/latest/reference/support-matrix.html
*/
const installPytorch = async () => {
- LogHelper.info('Installing PyTorch nightly with CUDA support...')
+ LogHelper.info('Installing PyTorch with CUDA support...')
try {
-   await command(
-     'pipenv run pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu124',
-     {
-       shell: true,
-       stdio: 'inherit'
-     }
-   )
-   LogHelper.success('PyTorch nightly with CUDA support installed')
+   // There is no CUDA support on macOS
+   const commandToExecute =
+     osType === OSTypes.MacOS
+       ? 'pipenv run pip install --ignore-installed torch==2.3.0'
+       : 'pipenv run pip install --ignore-installed torch==2.3.0 --index-url https://download.pytorch.org/whl/cu121'
+   await command(commandToExecute, {
+     shell: true,
+     stdio: 'inherit'
+   })
+   LogHelper.success('PyTorch with CUDA support installed')
} catch (e) {
-   LogHelper.error(
-     `Failed to install PyTorch nightly with CUDA support: ${e}`
-   )
+   LogHelper.error(`Failed to install PyTorch with CUDA support: ${e}`)
process.exit(1)
}
}
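
To confirm the pinned wheel actually ships with CUDA support, a quick check can be run in the same Pipenv environment (a sketch, not part of this commit):

import torch

# e.g. "2.3.0+cu121" on Linux/Windows, plain "2.3.0" on macOS
print(torch.__version__)
# True only when a compatible NVIDIA driver and CUDA runtime are present
print(torch.cuda.is_available())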

View File

@@ -10,21 +10,16 @@ python_version = "3.9.10"
# Setup
setuptools = "*"
wheel = "*"
# Build
cx-freeze = "==6.11.1"
cx-freeze = "==7.0.0"
# Common
python-dotenv = "==0.19.2"
# TCP server
spacy = "==3.5.4"
geonamescache = "==1.6.0"
# TCP server; TTS
# PyTorch is installed via the setup script
# torch = "*"
# TTS
transformers = "==4.27.4"
g2p-en = "==2.1.0"

View File

@@ -1,6 +1,9 @@
import os
+ import sys
- SRC_PATH = os.path.join(os.getcwd(), 'tcp_server', 'src')
+ IS_RAN_FROM_BINARY = getattr(sys, 'frozen', False)
+ SRC_PATH = os.path.join(os.getcwd(), 'tcp_server', 'src') if not IS_RAN_FROM_BINARY else '.'
# TTS
TTS_MODEL_VERSION = 'V1'
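
cx_Freeze sets sys.frozen on the packaged executable, which is what the getattr(sys, 'frozen', False) check relies on. A minimal sketch of the resulting path resolution (the resolve_resource helper is hypothetical, not part of this commit):

import os
import sys

IS_RAN_FROM_BINARY = getattr(sys, 'frozen', False)

def resolve_resource(*parts: str) -> str:
    # Hypothetical helper: resolve next to the frozen binary when packaged,
    # or under the source tree during development
    if IS_RAN_FROM_BINARY:
        base = os.path.dirname(sys.executable)
    else:
        base = os.path.join(os.getcwd(), 'tcp_server', 'src')
    return os.path.join(base, *parts)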

View File

@@ -1,6 +1,7 @@
import copy
from sys import argv
import spacy
+ import time
from geonamescache import GeonamesCache
lang = argv[1] or 'en'
@@ -41,9 +42,12 @@ def load_spacy_model() -> None:
model = spacy_model_mapping[lang]['model']
exclude = spacy_model_mapping[lang]['exclude']
+ tic = time.perf_counter()
log(f'Loading {model} spaCy model...')
spacy_nlp = spacy.load(model, exclude=exclude)
log('spaCy model loaded')
+ toc = time.perf_counter()
+ log(f"Time taken to load spaCy model: {toc - tic:0.4f} seconds")
def delete_unneeded_country_data(data: dict) -> None:
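
The tic/toc pattern with time.perf_counter() introduced here is repeated in the TCP server and the TTS API below. A reusable alternative would be a small context manager (a sketch, not what the commit does):

import time
from contextlib import contextmanager

@contextmanager
def timed(label: str):
    # Log the wall-clock time spent inside the block on exit
    tic = time.perf_counter()
    try:
        yield
    finally:
        toc = time.perf_counter()
        print(f'Time taken to {label}: {toc - tic:0.4f} seconds')

# Usage: with timed('load spaCy model'): spacy_nlp = spacy.load(model, exclude=exclude)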

View File

@@ -2,6 +2,7 @@ import socket
import json
import os
from typing import Union
+ import time
import lib.nlp as nlp
from .tts.api import TTS
@@ -47,7 +48,11 @@ class TCPServer:
output_path = 'output.wav'
speed = 1.0
+ tic = time.perf_counter()
self.tts.tts_to_file(text, speaker_ids['EN-Leon-V1'], output_path, speed=speed)
+ toc = time.perf_counter()
+ self.log(f"Time taken to generate audio: {toc - tic:0.4f} seconds")
def init(self):
# Make sure to establish the TCP connection by reusing the address so it does not conflict with a port already in use

View File

@@ -4,11 +4,14 @@ import numpy as np
import torch.nn as nn
from tqdm import tqdm
import torch
+ import time
from . import utils
from .models import SynthesizerTrn
from .split_utils import split_sentence
+ # torch.backends.cudnn.enabled = False
class TTS(nn.Module):
def __init__(self,
language,
@@ -18,6 +21,7 @@
ckpt_path=None):
super().__init__()
+ tic = time.perf_counter()
self.log('Loading model...')
if device == 'auto':
@@ -62,6 +66,9 @@
self.language = 'ZH_MIX_EN' if language == 'ZH' else language # we support a ZH_MIX_EN model
self.log('Model loaded')
+ toc = time.perf_counter()
+ self.log(f"Time taken to load model: {toc - tic:0.4f} seconds")
@staticmethod
def audio_numpy_concat(segment_data_list, sr, speed=1.):

View File

@@ -24,16 +24,6 @@ class LayerNorm(nn.Module):
return x.transpose(1, -1)
-@torch.jit.script
-def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
-    n_channels_int = n_channels[0]
-    in_act = input_a + input_b
-    t_act = torch.tanh(in_act[:, :n_channels_int, :])
-    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
-    acts = t_act * s_act
-    return acts
class Encoder(nn.Module):
def __init__(
self,

View File

@@ -97,16 +97,6 @@ def subsequent_mask(length):
return mask
-@torch.jit.script
-def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
-    n_channels_int = n_channels[0]
-    in_act = input_a + input_b
-    t_act = torch.tanh(in_act[:, :n_channels_int, :])
-    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
-    acts = t_act * s_act
-    return acts
def convert_pad_shape(pad_shape):
layer = pad_shape[::-1]
pad_shape = [item for sublist in layer for item in sublist]
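
Both removed copies of fused_add_tanh_sigmoid_multiply implement the WaveNet-style gated activation: the summed inputs are split along the channel axis, the first half goes through tanh, the second half through sigmoid, and the two are multiplied. An illustrative plain-torch version of the same computation:

import torch

a = torch.randn(1, 8, 16)  # (batch, 2 * n_channels, time)
b = torch.randn(1, 8, 16)
n = 4  # number of gated output channels
s = a + b
# tanh "filter" on the first half of the channels, sigmoid "gate" on the rest
out = torch.tanh(s[:, :n, :]) * torch.sigmoid(s[:, n:, :])
print(out.shape)  # torch.Size([1, 4, 16])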

View File

@@ -182,33 +182,6 @@ class WN(torch.nn.Module):
res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight")
self.res_skip_layers.append(res_skip_layer)
-    def forward(self, x, x_mask, g=None, **kwargs):
-        output = torch.zeros_like(x)
-        n_channels_tensor = torch.IntTensor([self.hidden_channels])
-        if g is not None:
-            g = self.cond_layer(g)
-        for i in range(self.n_layers):
-            x_in = self.in_layers[i](x)
-            if g is not None:
-                cond_offset = i * 2 * self.hidden_channels
-                g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :]
-            else:
-                g_l = torch.zeros_like(x_in)
-            acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
-            acts = self.drop(acts)
-            res_skip_acts = self.res_skip_layers[i](acts)
-            if i < self.n_layers - 1:
-                res_acts = res_skip_acts[:, : self.hidden_channels, :]
-                x = (x + res_acts) * x_mask
-                output = output + res_skip_acts[:, self.hidden_channels :, :]
-            else:
-                output = output + res_skip_acts
-        return output * x_mask
def remove_weight_norm(self):
if self.gin_channels != 0:
torch.nn.utils.remove_weight_norm(self.cond_layer)

View File

@@ -2,6 +2,7 @@ import pickle
import os
import re
from g2p_en import G2p
+ from transformers import AutoTokenizer
from . import symbols
@@ -9,7 +10,6 @@ from .english_utils.abbreviations import expand_abbreviations
from .english_utils.time_norm import expand_time_english
from .english_utils.number_norm import normalize_numbers
- from transformers import AutoTokenizer
current_file_path = os.path.dirname(__file__)
CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep")

View File

@@ -2,6 +2,7 @@ import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
import sys
model_id = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = None

View File

@@ -1,6 +1,7 @@
+ from transformers import AutoTokenizer
from .fr_phonemizer import cleaner as fr_cleaner
from .fr_phonemizer import fr_to_ipa
- from transformers import AutoTokenizer
def distribute_phone(n_phone, n_word):

View File

@@ -1,27 +1,45 @@
from cx_Freeze import setup, Executable
import sysconfig
+ import sys
from version import __version__
"""
Increase the recursion limit to avoid RecursionError
@see: https://github.com/marcelotduarte/cx_Freeze/issues/2240
"""
sys.setrecursionlimit(sys.getrecursionlimit() * 10)
"""
Instead of injecting everything from a package,
it's recommended to only include the necessary files via the
"include_files" property.
"""
options = {
'build_exe': {
'packages': [
'spacy',
'torch',
'en_core_web_trf',
-      'fr_core_news_md'
+      'fr_core_news_md',
+      'pycrfsuite'
],
'includes': [
'srsly.msgpack.util',
'blis',
'cymem'
],
+    'include_files': [
+      ('tcp_server/src/.venv/lib/python3.9/site-packages/nvidia/cudnn/lib', 'lib/nvidia/cudnn/lib')
+    ]
}
}
# Include private libraries from the tokenizers package for Linux
if 'linux' in sysconfig.get_platform():
-  options['build_exe']['include_files'] = [('tcp_server/src/.venv/lib/python3.9/site-packages/tokenizers.libs', 'lib/tokenizers.libs')]
+  options['build_exe']['include_files'] = [
+    *options['build_exe']['include_files'],
+    ('tcp_server/src/.venv/lib/python3.9/site-packages/tokenizers.libs', 'lib/tokenizers.libs')
+  ]
executables = [
Executable(