gpt-code-clippy/EDA.ipynb

{
"metadata": {
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
},
"orig_nbformat": 4,
"kernelspec": {
"name": "python3",
"display_name": "Python 3.8.10 64-bit ('.venv': venv)"
},
"interpreter": {
"hash": "25d32b45eddbc1e23c07b06a2c9ff49f418e028170a37fc806346e2c2002bf83"
}
},
"nbformat": 4,
"nbformat_minor": 2,
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n"
]
}
],
"source": [
"import pickle\n",
"from collections import Counter\n",
"from uuid import uuid4\n",
"\n",
"from tqdm import tqdm\n",
"\n",
"from datasets import load_dataset\n",
"from transformers import AutoTokenizer"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"Using the latest cached version of the module from /home/shpotes/.cache/huggingface/modules/datasets_modules/datasets/code_clippy/86b09b4a623c1c39753a8ad165e05757d9a97daf132ac71d3b6eb791e7da16dd (last modified on Fri Jul 9 22:06:59 2021) since it couldn't be found locally at $HOME/gpt-code-clippy/code_clippy.py/code_clippy.py or remotely (FileNotFoundError).\n",
"Using the latest cached version of the module from /home/shpotes/.cache/huggingface/modules/datasets_modules/datasets/code_clippy/86b09b4a623c1c39753a8ad165e05757d9a97daf132ac71d3b6eb791e7da16dd (last modified on Fri Jul 9 22:06:59 2021) since it couldn't be found locally at $HOME/gpt-code-clippy/code_clippy.py/code_clippy.py or remotely (FileNotFoundError).\n",
"Using custom data configuration default-668fb26707140662\n",
"Using the latest cached version of the module from /home/shpotes/.cache/huggingface/modules/datasets_modules/datasets/code_clippy/86b09b4a623c1c39753a8ad165e05757d9a97daf132ac71d3b6eb791e7da16dd (last modified on Fri Jul 9 22:06:59 2021) since it couldn't be found locally at $HOME/gpt-code-clippy/code_clippy.py/code_clippy.py or remotely (FileNotFoundError).\n",
"Using the latest cached version of the module from /home/shpotes/.cache/huggingface/modules/datasets_modules/datasets/code_clippy/86b09b4a623c1c39753a8ad165e05757d9a97daf132ac71d3b6eb791e7da16dd (last modified on Fri Jul 9 22:06:59 2021) since it couldn't be found locally at $HOME/gpt-code-clippy/code_clippy.py/code_clippy.py or remotely (FileNotFoundError).\n",
"Using custom data configuration default-668fb26707140662\n"
]
}
],
"source": [
"train_dataset = load_dataset(\n",
" \"$HOME/gpt-code-clippy/code_clippy.py\",\n",
" data_dir=\"/home/shared/code-clippy-dataset/merged-data\",\n",
" streaming=True,\n",
" split=\"train\"\n",
")\n",
"\n",
"eval_dataset = load_dataset(\n",
" \"$HOME/gpt-code-clippy/code_clippy.py\",\n",
" data_dir=\"/home/shared/code-clippy-dataset/merged-data\",\n",
" streaming=True,\n",
" split=\"validation\"\n",
")"
]
},
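{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check of the streaming datasets (illustrative, not part of the original analysis): peek at the first record to confirm it exposes the `text` and `file_name` fields that `_get_stats` consumes below. With `streaming=True` the splits are iterable datasets, so we just pull one item."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative probe: assumes each streamed record is a dict with at least\n",
"# 'text' and 'file_name', as used by _get_stats below.\n",
"sample = next(iter(train_dataset))\n",
"print(sample[\"file_name\"])\n",
"print(sample[\"text\"][:200])"
]
},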
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"tokenizer = AutoTokenizer.from_pretrained(\"EleutherAI/gpt-neo-125M\")"
]
},
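{
"cell_type": "markdown",
"metadata": {},
"source": [
"The GPT-Neo tokenizer is only used for counting tokens, so exceeding the model's 2048-token context (the warning emitted while the training split is processed below) is harmless here. A small illustrative check on a made-up snippet:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative only: the snippet is invented; len(input_ids) is the same\n",
"# per-file token count that _get_stats accumulates per extension.\n",
"snippet = \"def add(a, b):\\n    return a + b\\n\"\n",
"print(tokenizer.model_max_length)\n",
"print(len(tokenizer(snippet)[\"input_ids\"]))"
]
},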
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def _get_stats(example):\n",
" num_of_tokens = map(len, tokenizer(example[\"text\"])['input_ids'])\n",
" num_of_lines = map(lambda x: x.count(\"\\n\"), example[\"text\"])\n",
" file_name = map(lambda x: \".\".join(x.split(\".\")[:-1]), example[\"file_name\"])\n",
" langs = list(map(lambda x: x.split(\".\")[-1], example[\"file_name\"]))\n",
"\n",
" lang_map = {}\n",
" acc_tok = []\n",
" acc_lines = []\n",
" acc_fnames = []\n",
"\n",
" for tok, lines, fname, lang in zip(num_of_tokens, num_of_lines, file_name, langs):\n",
" if not lang in lang_map:\n",
" lang_idx = len(acc_tok)\n",
" lang_map[lang] = lang_idx\n",
"\n",
" acc_tok.append([tok])\n",
" acc_lines.append([lines])\n",
" acc_fnames.append([Counter({fname: 1})])\n",
" else:\n",
" lang_idx = lang_map[lang]\n",
"\n",
" acc_tok[lang_idx][0] += tok\n",
" acc_lines[lang_idx][0] += lines\n",
" acc_fnames[lang_idx][0].update({fname: 1})\n",
" \n",
" lang = [[k] for k, v in sorted(lang_map.items(), key=lambda item: item[1])]\n",
" _id = [str(uuid4())] * len(lang)\n",
"\n",
" return {\n",
" \"ext\": lang,\n",
" \"id\": _id,\n",
" \"acc_num_of_tokens\": acc_tok,\n",
" \"acc_num_of_lines\": acc_lines,\n",
" \"acc_file_names\": acc_fnames,\n",
" }"
]
},
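{
"cell_type": "markdown",
"metadata": {},
"source": [
"To illustrate what `_get_stats` returns, run it on a hypothetical three-file batch (the file names and contents below are invented, not taken from the dataset). The two `.py` files should collapse into a single accumulator row, the `.js` file into another."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical mini-batch, for illustration only.\n",
"_demo_batch = {\n",
"    \"text\": [\"print('hello')\\n\", \"const x = 1;\\n\", \"def f():\\n    pass\\n\"],\n",
"    \"file_name\": [\"a.py\", \"b.js\", \"c.py\"],\n",
"}\n",
"_get_stats(_demo_batch)"
]
},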
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
" def collapse_metrics(per_language_dataset, target_file_name):\n",
" num_of_tokens = {}\n",
" num_of_lines = {}\n",
" file_names = {}\n",
"\n",
" observed_lang = set()\n",
"\n",
" for row in tqdm(per_language_dataset):\n",
" lang = row['ext'][0]\n",
" if lang in observed_lang:\n",
" num_of_tokens[lang] += row['acc_num_of_tokens'][0]\n",
" num_of_lines[lang] += row['acc_num_of_lines'][0]\n",
" file_names[lang].update(row['acc_file_names'][0])\n",
" else:\n",
" num_of_tokens[lang] = row['acc_num_of_tokens'][0]\n",
" num_of_lines[lang] = row['acc_num_of_lines'][0]\n",
" file_names[lang] = row['acc_file_names'][0]\n",
"\n",
" with open(target_file_name, 'wb') as buf:\n",
" pickle.dump({\n",
" 'num_of_tokens': num_of_tokens,\n",
" 'num_of_lines': num_of_lines,\n",
" 'file_names': file_names,\n",
" }, buf)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"train_dataset = train_dataset.map(_get_stats, batched=True, batch_size=50_000)\n",
"eval_dataset = eval_dataset.map(_get_stats, batched=True, batch_size=50_000)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"0it [00:00, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2708 > 2048). Running this sequence through the model will result in indexing errors\n",
"61it [00:55, 1.36it/s]"
]
}
],
"source": [
"collapse_metrics(train_dataset, 'train_metrics.pkl')"
]
},
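{
"cell_type": "markdown",
"metadata": {},
"source": [
"Once `train_metrics.pkl` has been written, the aggregates can be read back for inspection. A minimal sketch (assuming the cell above finished writing the file) listing the extensions with the most tokens:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: load the pickled aggregates and rank extensions by token count.\n",
"with open(\"train_metrics.pkl\", \"rb\") as buf:\n",
"    train_metrics = pickle.load(buf)\n",
"\n",
"sorted(train_metrics[\"num_of_tokens\"].items(), key=lambda kv: kv[1], reverse=True)[:10]"
]
},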
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"collapse_metrics(eval_dataset, 'eval_metrics.pkl')"
]
}
]
}