gpt-code-clippy/EDA.ipynb

{
"metadata": {
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
},
"orig_nbformat": 4,
"kernelspec": {
"name": "python3",
"display_name": "Python 3.8.10 64-bit ('.venv': venv)"
},
"interpreter": {
"hash": "25d32b45eddbc1e23c07b06a2c9ff49f418e028170a37fc806346e2c2002bf83"
}
},
"nbformat": 4,
"nbformat_minor": 2,
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n"
]
}
],
"source": [
"import pickle\n",
"from collections import Counter\n",
"from uuid import uuid4\n",
"\n",
"from tqdm import tqdm\n",
"\n",
"from datasets import load_dataset\n",
"from transformers import AutoTokenizer"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"Using the latest cached version of the module from /home/shpotes/.cache/huggingface/modules/datasets_modules/datasets/code_clippy/86b09b4a623c1c39753a8ad165e05757d9a97daf132ac71d3b6eb791e7da16dd (last modified on Fri Jul 9 22:06:59 2021) since it couldn't be found locally at $HOME/gpt-code-clippy/code_clippy.py/code_clippy.py or remotely (FileNotFoundError).\n",
"Using the latest cached version of the module from /home/shpotes/.cache/huggingface/modules/datasets_modules/datasets/code_clippy/86b09b4a623c1c39753a8ad165e05757d9a97daf132ac71d3b6eb791e7da16dd (last modified on Fri Jul 9 22:06:59 2021) since it couldn't be found locally at $HOME/gpt-code-clippy/code_clippy.py/code_clippy.py or remotely (FileNotFoundError).\n",
"Using custom data configuration default-668fb26707140662\n",
"Using the latest cached version of the module from /home/shpotes/.cache/huggingface/modules/datasets_modules/datasets/code_clippy/86b09b4a623c1c39753a8ad165e05757d9a97daf132ac71d3b6eb791e7da16dd (last modified on Fri Jul 9 22:06:59 2021) since it couldn't be found locally at $HOME/gpt-code-clippy/code_clippy.py/code_clippy.py or remotely (FileNotFoundError).\n",
"Using the latest cached version of the module from /home/shpotes/.cache/huggingface/modules/datasets_modules/datasets/code_clippy/86b09b4a623c1c39753a8ad165e05757d9a97daf132ac71d3b6eb791e7da16dd (last modified on Fri Jul 9 22:06:59 2021) since it couldn't be found locally at $HOME/gpt-code-clippy/code_clippy.py/code_clippy.py or remotely (FileNotFoundError).\n",
"Using custom data configuration default-668fb26707140662\n"
]
}
],
"source": [
"train_dataset = load_dataset(\n",
" \"$HOME/gpt-code-clippy/code_clippy.py\",\n",
" data_dir=\"/home/shared/code-clippy-dataset/merged-data\",\n",
" streaming=True,\n",
" split=\"train\"\n",
")\n",
"\n",
"eval_dataset = load_dataset(\n",
" \"$HOME/gpt-code-clippy/code_clippy.py\",\n",
" data_dir=\"/home/shared/code-clippy-dataset/merged-data\",\n",
" streaming=True,\n",
" split=\"validation\"\n",
")"
]
},
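{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check of the streaming datasets (illustrative, not part of the original analysis): peek at the first record to confirm it exposes the `text` and `file_name` fields that `_get_stats` consumes below. With `streaming=True` the splits are iterable datasets, so we just pull one item."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative probe: assumes each streamed record is a dict with at least\n",
"# 'text' and 'file_name', as used by _get_stats below.\n",
"sample = next(iter(train_dataset))\n",
"print(sample[\"file_name\"])\n",
"print(sample[\"text\"][:200])"
]
},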
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"tokenizer = AutoTokenizer.from_pretrained(\"EleutherAI/gpt-neo-125M\")"
]
},
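{
"cell_type": "markdown",
"metadata": {},
"source": [
"The GPT-Neo tokenizer is only used for counting tokens, so exceeding the model's 2048-token context (the warning emitted while the training split is processed below) is harmless here. A small illustrative check on a made-up snippet:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative only: the snippet is invented; len(input_ids) is the same\n",
"# per-file token count that _get_stats accumulates per extension.\n",
"snippet = \"def add(a, b):\\n    return a + b\\n\"\n",
"print(tokenizer.model_max_length)\n",
"print(len(tokenizer(snippet)[\"input_ids\"]))"
]
},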
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def _get_stats(example):\n",
" num_of_tokens = map(len, tokenizer(example[\"text\"])['input_ids'])\n",
" num_of_lines = map(lambda x: x.count(\"\\n\"), example[\"text\"])\n",
" file_name = map(lambda x: \".\".join(x.split(\".\")[:-1]), example[\"file_name\"])\n",
" langs = list(map(lambda x: x.split(\".\")[-1], example[\"file_name\"]))\n",
"\n",
" lang_map = {}\n",
" acc_tok = []\n",
" acc_lines = []\n",
" acc_fnames = []\n",
"\n",
" for tok, lines, fname, lang in zip(num_of_tokens, num_of_lines, file_name, langs):\n",
" if not lang in lang_map:\n",
" lang_idx = len(acc_tok)\n",
" lang_map[lang] = lang_idx\n",
"\n",
" acc_tok.append([tok])\n",
" acc_lines.append([lines])\n",
" acc_fnames.append([Counter({fname: 1})])\n",
" else:\n",
" lang_idx = lang_map[lang]\n",
"\n",
" acc_tok[lang_idx][0] += tok\n",
" acc_lines[lang_idx][0] += lines\n",
" acc_fnames[lang_idx][0].update({fname: 1})\n",
" \n",
" lang = [[k] for k, v in sorted(lang_map.items(), key=lambda item: item[1])]\n",
" _id = [str(uuid4())] * len(lang)\n",
"\n",
" return {\n",
" \"ext\": lang,\n",
" \"id\": _id,\n",
" \"acc_num_of_tokens\": acc_tok,\n",
" \"acc_num_of_lines\": acc_lines,\n",
" \"acc_file_names\": acc_fnames,\n",
" }"
]
},
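{
"cell_type": "markdown",
"metadata": {},
"source": [
"To illustrate what `_get_stats` returns, run it on a hypothetical three-file batch (the file names and contents below are invented, not taken from the dataset). The two `.py` files should collapse into a single accumulator row, the `.js` file into another."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical mini-batch, for illustration only.\n",
"_demo_batch = {\n",
"    \"text\": [\"print('hello')\\n\", \"const x = 1;\\n\", \"def f():\\n    pass\\n\"],\n",
"    \"file_name\": [\"a.py\", \"b.js\", \"c.py\"],\n",
"}\n",
"_get_stats(_demo_batch)"
]
},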
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
" def collapse_metrics(per_language_dataset, target_file_name):\n",
" num_of_tokens = {}\n",
" num_of_lines = {}\n",
" file_names = {}\n",
"\n",
" observed_lang = set()\n",
"\n",
" for row in tqdm(per_language_dataset):\n",
" lang = row['ext'][0]\n",
" if lang in observed_lang:\n",
" num_of_tokens[lang] += row['acc_num_of_tokens'][0]\n",
" num_of_lines[lang] += row['acc_num_of_lines'][0]\n",
" file_names[lang].update(row['acc_file_names'][0])\n",
" else:\n",
" num_of_tokens[lang] = row['acc_num_of_tokens'][0]\n",
" num_of_lines[lang] = row['acc_num_of_lines'][0]\n",
" file_names[lang] = row['acc_file_names'][0]\n",
"\n",
" with open(target_file_name, 'wb') as buf:\n",
" pickle.dump({\n",
" 'num_of_tokens': num_of_tokens,\n",
" 'num_of_lines': num_of_lines,\n",
" 'file_names': file_names,\n",
" }, buf)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"train_dataset = train_dataset.map(_get_stats, batched=True, batch_size=50_000)\n",
"eval_dataset = eval_dataset.map(_get_stats, batched=True, batch_size=50_000)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"0it [00:00, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2708 > 2048). Running this sequence through the model will result in indexing errors\n",
"61it [00:55, 1.36it/s]"
]
}
],
"source": [
"collapse_metrics(train_dataset, 'train_metrics.pkl')"
]
},
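{
"cell_type": "markdown",
"metadata": {},
"source": [
"Once `train_metrics.pkl` has been written, the aggregates can be read back for inspection. A minimal sketch (assuming the cell above finished writing the file) listing the extensions with the most tokens:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: load the pickled aggregates and rank extensions by token count.\n",
"with open(\"train_metrics.pkl\", \"rb\") as buf:\n",
"    train_metrics = pickle.load(buf)\n",
"\n",
"sorted(train_metrics[\"num_of_tokens\"].items(), key=lambda kv: kv[1], reverse=True)[:10]"
]
},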
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"collapse_metrics(eval_dataset, 'eval_metrics.pkl')"
]
}
]
}