mirror of
https://github.com/google/sentencepiece.git
synced 2025-01-04 06:41:54 +03:00
139 lines
2.7 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### You can add new special tokens to a pre-trained SentencePiece model\n",
|
|
"#### Run this code in google/sentencepiece/python/"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Load the pre-trained SentencePiece model\n",
"A pre-trained model file (old.model) is required as the starting point"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"371391"
|
|
]
|
|
},
|
|
"execution_count": 1,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
"import sentencepiece_model_pb2 as model\n",
"\n",
"m = model.ModelProto()\n",
"# Read inside a context manager so the file handle is closed deterministically.\n",
"# ParseFromString stays the last top-level expression so the cell still\n",
"# displays the number of bytes consumed (371391 above).\n",
"with open(\"old.model\", \"rb\") as f:\n",
"    serialized = f.read()\n",
"m.ParseFromString(serialized)"
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Load the tokens to add\n",
"Prepare the list of new tokens you want to add"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"['[UNK]',\n",
|
|
" '[PAD]',\n",
|
|
" '[CLS]',\n",
|
|
" '[SEP]',\n",
|
|
" '[MASK]',\n",
|
|
" '[EOS]',\n",
|
|
" '[DOMAIN]',\n",
|
|
" '[SLOT]',\n",
|
|
" '[ACTION]']"
|
|
]
|
|
},
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
"# splitlines() (unlike split(\"\\n\")) yields no trailing empty token when the\n",
"# file ends with a newline; blank lines are filtered out as well, since an\n",
"# empty piece would corrupt the model. The context manager closes the file.\n",
"with open(\"special_tokens.txt\", \"r\") as f:\n",
"    special_tokens = [token for token in f.read().splitlines() if token]\n",
"special_tokens"
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Add new tokens to sentencepiece model"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"# Appending a piece whose surface form already exists (or an empty string)\n",
"# produces a model with duplicate/empty vocabulary entries, so skip those.\n",
"existing_pieces = {p.piece for p in m.pieces}\n",
"for token in special_tokens:\n",
"    if not token or token in existing_pieces:\n",
"        continue\n",
"    new_token = model.ModelProto().SentencePiece()\n",
"    new_token.piece = token\n",
"    new_token.score = 0\n",
"    m.pieces.append(new_token)\n",
"    existing_pieces.add(token)"
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Save new sentencepiece model\n",
|
|
"Load the new sentencepiece model to your NLP system"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"# Serialize the modified proto; point your tokenizer at new.model instead\n",
"# of the original file.\n",
"with open('new.model', 'wb') as fp:\n",
"    fp.write(m.SerializeToString())"
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.6.10"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
}
|