Merge branch 'main' of github.com:ncoop57/gpt-code-clippy into main

This commit is contained in:
ncoop57 2021-07-20 20:24:33 +00:00
commit 4d091e2b02
5 changed files with 2080 additions and 254 deletions

code-clippy-app.ipynb Normal file (256 additions)

@ -0,0 +1,256 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "333a4f54-e120-4969-8adf-32b98655ff41",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2021-07-18 23:45:31.083087: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory\n",
"2021-07-18 23:45:31.083131: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\n"
]
}
],
"source": [
"import gradio as gr\n",
"\n",
"from transformers import AutoModelForCausalLM, AutoTokenizer"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "d875c6bc-8d97-4e03-9ee4-a4bd47f191bf",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/arto/transformers/src/transformers/modeling_flax_pytorch_utils.py:201: UserWarning: The given NumPy array is not writeable, and PyTorch does not support non-writeable tensors. This means you can write to the underlying (supposedly non-writeable) NumPy array using the tensor. You may want to copy the array to protect its data or make it writeable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /pytorch/torch/csrc/utils/tensor_numpy.cpp:180.)\n",
" pt_model_dict[flax_key] = torch.from_numpy(flax_tensor)\n",
"All Flax model weights were used when initializing GPTNeoForCausalLM.\n",
"\n",
"Some weights of GPTNeoForCausalLM were not initialized from the Flax model and are newly initialized: ['transformer.h.4.attn.attention.bias', 'transformer.h.8.attn.attention.masked_bias', 'transformer.h.0.attn.attention.bias', 'transformer.h.2.attn.attention.masked_bias', 'transformer.h.2.attn.attention.bias', 'transformer.h.8.attn.attention.bias', 'transformer.h.10.attn.attention.masked_bias', 'transformer.h.11.attn.attention.masked_bias', 'transformer.h.3.attn.attention.masked_bias', 'transformer.h.9.attn.attention.masked_bias', 'transformer.h.6.attn.attention.masked_bias', 'transformer.h.7.attn.attention.masked_bias', 'transformer.h.6.attn.attention.bias', 'transformer.h.0.attn.attention.masked_bias', 'transformer.h.1.attn.attention.masked_bias', 'transformer.h.5.attn.attention.masked_bias', 'transformer.h.4.attn.attention.masked_bias', 'transformer.h.10.attn.attention.bias', 'lm_head.weight']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
]
}
],
"source": [
"model_name = \"/home/shared/models/gpt-code-clippy-125M-apps-lr-adam1e-4-bs128/ckpt-1633/\"\n",
"model = AutoModelForCausalLM.from_pretrained(model_name, from_flax=True)\n",
"tokenizer = AutoTokenizer.from_pretrained(\"EleutherAI/gpt-neo-125M\")\n",
"tokenizer.pad_token = tokenizer.eos_token"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "e6ebb017-e784-4311-a795-4bacd2263d19",
"metadata": {},
"outputs": [],
"source": [
"def format_input(question, starter_code=\"\"):\n",
" answer_type = \"\\nUse Call-Based format\\n\" if starter_code else \"\\nUse Standard Input format\\n\"\n",
" return f\"\\nQUESTION:\\n{question}\\n{starter_code}\\n{answer_type}\\nANSWER:\\n\""
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "b00ad3ab-3086-48f0-940a-3129a7dff30a",
"metadata": {},
"outputs": [],
"source": [
"def format_outputs(text):\n",
" formatted_text =f'''\n",
" <head>\n",
" <link rel=\"stylesheet\"\n",
" href=\"https://cdnjs.cloudflare.com/ajax/libs/highlight.js/10.0.3/styles/default.min.css\">\n",
" <script src=\"https://cdnjs.cloudflare.com/ajax/libs/highlight.js/10.0.3/highlight.min.js\"></script>\n",
" <script>hljs.initHighlightingOnLoad();</script>\n",
" </head>\n",
" <body>\n",
" <pre><code class=\"python\">{text}</code></pre>\n",
" </body>\n",
" '''\n",
" return formatted_text"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "28abdab5-962e-47af-83b2-4f9c491ba705",
"metadata": {},
"outputs": [],
"source": [
"def generate_solution(question, starter_code=\"\", temperature=1., num_beams=1):\n",
" prompt = format_input(question, starter_code)\n",
" input_ids = tokenizer(prompt, return_tensors=\"pt\").input_ids\n",
" start = len(input_ids[0])\n",
" output = model.generate(\n",
" input_ids,\n",
" max_length=start+200,\n",
" do_sample=True,\n",
" top_p=0.95,\n",
" pad_token_id=tokenizer.pad_token_id,\n",
" early_stopping=True,\n",
" temperature=1.,\n",
" num_beams=int(num_beams),\n",
" no_repeat_ngram_size=None,\n",
" repetition_penalty=None,\n",
" num_return_sequences=None,\n",
" )\n",
" \n",
" return format_outputs(tokenizer.decode(output[0][start:]).strip())"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "a156ff76-7e50-40c9-8b3e-e9ddeb450501",
"metadata": {},
"outputs": [],
"source": [
"_EXAMPLES = [\n",
" [\n",
" \"\"\"\n",
"Given a 2D list of size `m * n`. Your task is to find the sum of minimum value in each row.\n",
"For Example:\n",
"```python\n",
"[\n",
" [1, 2, 3, 4, 5], # minimum value of row is 1\n",
" [5, 6, 7, 8, 9], # minimum value of row is 5\n",
" [20, 21, 34, 56, 100] # minimum value of row is 20\n",
"]\n",
"```\n",
"So, the function should return `26` because sum of minimums is as `1 + 5 + 20 = 26`\n",
" \"\"\",\n",
" \"\",\n",
" 0.8,\n",
" ],\n",
" [\n",
" \"\"\"\n",
"# Personalized greeting\n",
"\n",
"Create a function that gives a personalized greeting. This function takes two parameters: `name` and `owner`.\n",
" \"\"\",\n",
" \"\"\"\n",
"Use conditionals to return the proper message:\n",
"\n",
"case| return\n",
"--- | ---\n",
"name equals owner | 'Hello boss'\n",
"otherwise | 'Hello guest'\n",
"def greet(name, owner):\n",
" \"\"\",\n",
" 0.8,\n",
" ]\n",
"] "
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "fb3d90fc-6932-4343-9c86-70ae94ca95aa",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Running locally at: http://127.0.0.1:7861/\n",
"This share link will expire in 24 hours. If you need a permanent link, visit: https://gradio.app/introducing-hosted (NEW!)\n",
"Running on External URL: https://34711.gradio.app\n",
"Interface loading below...\n"
]
},
{
"data": {
"text/html": [
"\n",
" <iframe\n",
" width=\"900\"\n",
" height=\"500\"\n",
" src=\"https://34711.gradio.app\"\n",
" frameborder=\"0\"\n",
" allowfullscreen\n",
" ></iframe>\n",
" "
],
"text/plain": [
"<IPython.lib.display.IFrame at 0x7f0bf02db670>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(<Flask 'gradio.networking'>,\n",
" 'http://127.0.0.1:7861/',\n",
" 'https://34711.gradio.app')"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"inputs = [\n",
" gr.inputs.Textbox(placeholder=\"Define a problem here...\", lines=7),\n",
" gr.inputs.Textbox(placeholder=\"Provide optional starter code...\", lines=3),\n",
" gr.inputs.Slider(0.5, 1.5, 0.1, default=0.8, label=\"Temperature\"),\n",
" gr.inputs.Slider(1,4,1,default=1, label=\"Beam size\")\n",
"]\n",
"\n",
"outputs = [\n",
" gr.outputs.HTML(label=\"Solution\")\n",
"]\n",
"\n",
"gr.Interface(\n",
" generate_solution, \n",
" inputs=inputs, \n",
" outputs=outputs,\n",
" title=\"Code Clippy: Problem Solver\",\n",
" examples=_EXAMPLES,\n",
").launch(share=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "737cd94c-5286-4832-9611-06e6f2a89357",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
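
A quick way to exercise the notebook's pipeline (format_input -> model.generate -> decode) outside the Gradio UI is sketched below. This is a minimal sketch, not the notebook's own code: it assumes the public flax-community/gpt-code-clippy-125M-apps-alldata checkpoint in place of the machine-local ckpt path loaded above, and the question text is purely illustrative.

from transformers import AutoModelForCausalLM, AutoTokenizer

# Assumption: the public checkpoint stands in for the local
# /home/shared/models/... path used in the notebook above.
model_name = "flax-community/gpt-code-clippy-125M-apps-alldata"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Illustrative problem statement in the APPS "Standard Input" style.
question = "Read an integer N from standard input and print the sum 1 + 2 + ... + N."
prompt = f"\nQUESTION:\n{question}\n\nUse Standard Input format\n\nANSWER:\n"

input_ids = tokenizer(prompt, return_tensors="pt").input_ids
start = input_ids.shape[1]
output = model.generate(
    input_ids,
    max_length=start + 200,   # cap at 200 newly generated tokens
    do_sample=True,
    top_p=0.95,
    temperature=0.8,
    pad_token_id=tokenizer.pad_token_id,
)
# Drop the prompt tokens so only the generated solution is printed.
print(tokenizer.decode(output[0][start:], skip_special_tokens=True).strip())

Slicing the output at `start` mirrors what generate_solution does, so only the newly generated tokens are decoded rather than the prompt.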

demo_backup/app.py Normal file (145 additions)

@ -0,0 +1,145 @@
import gradio as gr
from rich.console import Console
from rich.syntax import Syntax
from transformers import AutoModelForCausalLM, AutoTokenizer
import requests
import json
import webbrowser

# model_name = "flax-community/gpt-code-clippy-1.3B-apps-alldata"
model_name = "flax-community/gpt-code-clippy-125M-apps-alldata"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Recording console so rich output can be exported as HTML for the Gradio widget.
console = Console(record=True)


def format_input(question, starter_code=""):
    answer_type = (
        "\nUse Call-Based format\n" if starter_code else "\nUse Standard Input format\n"
    )
    return f"\nQUESTION:\n{question}\n{starter_code}\n{answer_type}\nANSWER:\n"


def format_outputs(text):
    formatted_text = Syntax(
        text, "python", line_numbers=True, indent_guides=True, word_wrap=True
    )
    console.print(formatted_text)
    return console.export_html(inline_styles=True)


def generate_solution(question, starter_code="", temperature=1.0, num_beams=1):
    prompt = format_input(question, starter_code)
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    start = len(input_ids[0])
    output = model.generate(
        input_ids,
        max_length=start + 200,
        do_sample=True,
        top_p=0.95,
        pad_token_id=tokenizer.pad_token_id,
        early_stopping=True,
        temperature=temperature,
        num_beams=int(num_beams),
        no_repeat_ngram_size=None,
        repetition_penalty=None,
        num_return_sequences=None,
    )
    return format_outputs(
        tokenizer.decode(output[0][start:], skip_special_tokens=True).strip()
    )


_EXAMPLES = [
    [
        """
Given a 2D list of size `m * n`. Your task is to find the sum of minimum value in each row.
For Example:
```python
[
  [1, 2, 3, 4, 5],       # minimum value of row is 1
  [5, 6, 7, 8, 9],       # minimum value of row is 5
  [20, 21, 34, 56, 100]  # minimum value of row is 20
]
```
So, the function should return `26` because sum of minimums is as `1 + 5 + 20 = 26`
        """,
        "",
        0.8,
    ],
    [
        """
# Personalized greeting

Create a function that gives a personalized greeting. This function takes two parameters: `name` and `owner`.
        """,
        """
Use conditionals to return the proper message:

case| return
--- | ---
name equals owner | 'Hello boss'
otherwise | 'Hello guest'
def greet(name, owner):
        """,
        0.8,
    ],
]

inputs = [
    gr.inputs.Textbox(placeholder="Define a problem here...", lines=7),
    gr.inputs.Textbox(placeholder="Provide optional starter code...", lines=3),
    gr.inputs.Slider(0.5, 1.5, 0.1, default=0.8, label="Temperature"),
    gr.inputs.Slider(1, 4, 1, default=1, label="Beam size"),
    gr.inputs.Textbox(lines=1, label="Your GitHub API token"),
]

outputs = [gr.outputs.HTML(label="Solution")]
print(outputs)

# Experimental "carbon" sharing: post the generated solution as a GitHub gist,
# then open it in carbon.now.sh to get a shareable image.
GITHUB_API = "https://api.github.com"


def create_carbon_copy(solution_text, api_token):
    # form a request URL for the gists endpoint
    url = GITHUB_API + "/gists"
    headers = {"Authorization": "token %s" % api_token}
    params = {"scope": "gist"}
    # the gists API expects a mapping of file names to contents (file name is illustrative)
    payload = {"public": True, "files": {"solution.py": {"content": solution_text}}}
    res = requests.post(url, headers=headers, params=params, data=json.dumps(payload))
    # meant to be triggered by a "Create a 'carbon' copy" button in the UI
    carbon_url = "https://carbon.now.sh/" + res.json()["id"]
    webbrowser.open_new(carbon_url)


gr.Interface(
    generate_solution,
    inputs=inputs,
    outputs=outputs,
    title="Code Clippy: Problem Solver",
    examples=_EXAMPLES,
).launch(share=False)
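
Unlike the notebook, which wraps the decoded text in a hand-written highlight.js page, this demo renders it with rich and exports the recorded console as HTML. A minimal standalone sketch of that path, with an illustrative snippet standing in for a generated solution:

from rich.console import Console
from rich.syntax import Syntax

console = Console(record=True)

# Illustrative snippet standing in for a generated solution.
snippet = "def greet(name, owner):\n    return 'Hello boss' if name == owner else 'Hello guest'"

# Same rendering options as format_outputs() above.
console.print(Syntax(snippet, "python", line_numbers=True, indent_guides=True, word_wrap=True))
html = console.export_html(inline_styles=True)  # HTML with styles inlined
print(html[:200])

Because export_html(inline_styles=True) inlines the styling, the returned string can be handed to the gr.outputs.HTML component as-is.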


@ -1,6 +1,6 @@
#! /bin/bash
./run_clm_apps.py \
--output_dir /home/shared/models/gpt-code-clippy-1.3B-apps \
--output_dir /home/shared/models/gpt-code-clippy-1.3B-apps-3 \
--model_name_or_path EleutherAI/gpt-neo-1.3B \
--dataset_name ./apps.py \
--dataset_config_name formatted \
@ -24,11 +24,13 @@
--skip_memory_metrics="False" \
--save_steps="1000" \
--save_strategy epoch \
--save_total_limit 2 \
--save_total_limit="None" \
--gradient_accumulation_steps 1 \
--adafactor true \
--all_data true \
--all_data false \
--seed 842 \
--save_optimizer false \
--max_eval_samples 20000
# --resume_from_checkpoint $HOME/gpt-neo-125M-code-clippy/ckpt_201 \
# --max_train_samples="10000" \
# --max_eval_samples="1000"


@ -10,8 +10,8 @@
"name": "stderr",
"output_type": "stream",
"text": [
"2021-07-18 07:38:07.042553: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory\n",
"2021-07-18 07:38:07.042596: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\n"
"2021-07-18 19:07:02.959520: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory\n",
"2021-07-18 19:07:02.959564: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\n"
]
}
],
@ -36,24 +36,38 @@
}
],
"source": [
"dataset = load_dataset(\"/home/arto/datasets/datasets/apps/apps.py\", \"formatted\", split=\"train\")"
"dataset = load_dataset(\"/home/arto/datasets/datasets/apps/apps.py\", \"formatted\", split=\"test\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 3,
"id": "3811070c-c6a0-4a84-9362-cf7de0d5bd75",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/arto/transformers/src/transformers/modeling_flax_pytorch_utils.py:201: UserWarning: The given NumPy array is not writeable, and PyTorch does not support non-writeable tensors. This means you can write to the underlying (supposedly non-writeable) NumPy array using the tensor. You may want to copy the array to protect its data or make it writeable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /pytorch/torch/csrc/utils/tensor_numpy.cpp:180.)\n",
" pt_model_dict[flax_key] = torch.from_numpy(flax_tensor)\n",
"All Flax model weights were used when initializing GPTNeoForCausalLM.\n",
"\n",
"Some weights of GPTNeoForCausalLM were not initialized from the Flax model and are newly initialized: ['transformer.h.0.attn.attention.bias', 'transformer.h.6.attn.attention.masked_bias', 'transformer.h.3.attn.attention.masked_bias', 'transformer.h.2.attn.attention.bias', 'transformer.h.10.attn.attention.masked_bias', 'transformer.h.9.attn.attention.masked_bias', 'transformer.h.1.attn.attention.masked_bias', 'transformer.h.7.attn.attention.masked_bias', 'transformer.h.2.attn.attention.masked_bias', 'lm_head.weight', 'transformer.h.10.attn.attention.bias', 'transformer.h.11.attn.attention.masked_bias', 'transformer.h.4.attn.attention.masked_bias', 'transformer.h.8.attn.attention.masked_bias', 'transformer.h.0.attn.attention.masked_bias', 'transformer.h.5.attn.attention.masked_bias', 'transformer.h.4.attn.attention.bias', 'transformer.h.6.attn.attention.bias', 'transformer.h.8.attn.attention.bias']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
]
}
],
"source": [
"# model = AutoModelForCausalLM.from_pretrained(\"/home/arto/gpt-code-clippy-lr1e-4-bs1024-f/ckpt-80000\",from_flax=True)\n",
"model = AutoModelForCausalLM.from_pretrained(\"/home/shared/models/gpt-code-clippy-125M-apps-lr-5e-5/ckpt-8169/\",from_flax=True)\n",
"# model = AutoModelForCausalLM.from_pretrained(\"EleutherAI/gpt-neo-125M\")\n",
"tokenizer = AutoTokenizer.from_pretrained(\"EleutherAI/gpt-neo-125M\")\n",
"tokenizer.pad_token = tokenizer.eos_token"
]
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 36,
"id": "830a374f-a70d-466d-a766-3671e5c11765",
"metadata": {},
"outputs": [
@ -63,85 +77,70 @@
"text": [
"id :\n",
"\n",
"6386\n",
"33905\n",
"==========\n",
"question :\n",
"\n",
"\n",
"QUESTION:\n",
"Given an array arr.  You can choose a set of integers and remove all the occurrences of these integers in the array.\n",
"Return the minimum size of the set so that at least half of the integers of the array are removed.\n",
" \n",
"Example 1:\n",
"Input: arr = [3,3,3,3,5,5,5,2,2,7]\n",
"Output: 2\n",
"Explanation: Choosing {3,7} will make the new array [5,5,5,2,2] which has size 5 (i.e equal to half of the size of the old array).\n",
"Possible sets of size 2 are {3,5},{3,2},{5,2}.\n",
"Choosing set {2,7} is not possible as it will make the new array [3,3,3,3,5,5,5] which has size greater than half of the size of the old array.\n",
"There are N empty boxes arranged in a row from left to right.\n",
"The integer i is written on the i-th box from the left (1 \\leq i \\leq N).\n",
"For each of these boxes, Snuke can choose either to put a ball in it or to put nothing in it.\n",
"We say a set of choices to put a ball or not in the boxes is good when the following condition is satisfied:\n",
" - For every integer i between 1 and N (inclusive), the total number of balls contained in the boxes with multiples of i written on them is congruent to a_i modulo 2.\n",
"Does there exist a good set of choices? If the answer is yes, find one good set of choices.\n",
"\n",
"Example 2:\n",
"Input: arr = [7,7,7,7,7,7]\n",
"Output: 1\n",
"Explanation: The only possible set you can choose is {7}. This will make the new array empty.\n",
"-----Constraints-----\n",
" - All values in input are integers.\n",
" - 1 \\leq N \\leq 2 \\times 10^5\n",
" - a_i is 0 or 1.\n",
"\n",
"Example 3:\n",
"Input: arr = [1,9]\n",
"Output: 1\n",
"-----Input-----\n",
"Input is given from Standard Input in the following format:\n",
"N\n",
"a_1 a_2 ... a_N\n",
"\n",
"Example 4:\n",
"Input: arr = [1000,1000,3,7]\n",
"Output: 1\n",
"-----Output-----\n",
"If a good set of choices does not exist, print -1.\n",
"If a good set of choices exists, print one such set of choices in the following format:\n",
"M\n",
"b_1 b_2 ... b_M\n",
"\n",
"Example 5:\n",
"Input: arr = [1,2,3,4,5,6,7,8,9,10]\n",
"Output: 5\n",
"where M denotes the number of boxes that will contain a ball, and b_1,\\ b_2,\\ ...,\\ b_M are the integers written on these boxes, in any order.\n",
"\n",
" \n",
"Constraints:\n",
"-----Sample Input-----\n",
"3\n",
"1 0 0\n",
"\n",
"1 <= arr.length <= 10^5\n",
"arr.length is even.\n",
"1 <= arr[i] <= 10^5\n",
"class Solution:\n",
" def minSetSize(self, arr: List[int]) -> int:\n",
" \n",
"-----Sample Output-----\n",
"1\n",
"1\n",
"\n",
"Use Call-Based format\n",
"Consider putting a ball only in the box with 1 written on it.\n",
" - There are three boxes with multiples of 1 written on them: the boxes with 1, 2, and 3. The total number of balls contained in these boxes is 1.\n",
" - There is only one box with a multiple of 2 written on it: the box with 2. The total number of balls contained in these boxes is 0.\n",
" - There is only one box with a multiple of 3 written on it: the box with 3. The total number of balls contained in these boxes is 0.\n",
"Thus, the condition is satisfied, so this set of choices is good.\n",
"\n",
"\n",
"Use Standard Input format\n",
"\n",
"ANSWER:\n",
"\n",
"==========\n",
"answer :\n",
"\n",
"class Solution:\n",
"\tdef minSetSize(self, arr: List[int]) -> int:\n",
"\t\t# get length of array \n",
"\t\tlength = len(arr)\n",
"\t\t# build dict to count how many times each int appears\n",
"\t\tcounts = {}\n",
"\t\tfor num in arr:\n",
"\t\t\tif num not in counts:\n",
"\t\t\t\tcounts[num] =1\n",
"\t\t\telse:\n",
"\t\t\t\tcounts[num] += 1\n",
"\t\t\t\t\n",
"\t\t# print(counts)\n",
"\t\t\n",
"\t\t# get values from dict, sort in descending order\n",
"\t\tdescending = sorted(counts.values(), reverse = True)\n",
"\t\t# print(descending)\n",
"\t\t# initialize 2 variables: count and total\n",
"\t\tcount = 0\n",
"\t\ttotal = 0\n",
"\t\t# loop over descending list of counts\n",
"\t\tfor num in descending:\n",
"\t\t\t# add each number to our total\n",
"\t\t\ttotal += num\n",
"\t\t\t# increment count by 1\n",
"\t\t\tcount += 1\n",
"\t\t\t# if our total is half or more, return count\n",
"\t\t\tif total >= length/2:\n",
"\t\t\t\treturn count\n",
"N = int(input())\n",
"A = list(map(int,input().split()))\n",
"B = (N+1)*[0]\n",
"\n",
"for n in range(N,0,-1):\n",
"\tB[n] = (sum(B[n::n])+A[n-1])%2\n",
"\n",
"print(B.count(1))\n",
"for n in range(N+1):\n",
"\tif B[n]:\n",
"\t\tprint(n,end=\" \")\n",
"\n",
"==========\n"
]
@ -161,223 +160,332 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 11,
"id": "1b109cec-c721-410f-b3df-38a50ae8607b",
"metadata": {},
"outputs": [],
"source": [
"prompt = dataset[82239][\"question\"]"
"# prompt = dataset[82239][\"question\"]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "820f8729-a368-4b41-91ff-56601d21aaaf",
"execution_count": 37,
"id": "8fb6f26e-bc50-40e6-b373-3d9e50993b22",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
"Using custom data configuration formatted-d9019fd8ed858445\n",
"Reusing dataset apps (/home/shared/.cache/hf/datasets/apps/formatted-d9019fd8ed858445/0.1.0/5987476458cc986e36654364319c6fe798b880d64a35518cbc00dc04f3c41e4d)\n"
]
},
}
],
"source": [
"train_dataset = load_dataset(\"/home/arto/datasets/datasets/apps/apps.py\", \"formatted\", split=\"train\")"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "0eb58b5c-4a3b-48cb-82e1-e68b792f806e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"QUESTION:\n",
"~~~if:csharp,javascript,cfml,php\n",
"Given a 2D array of size `m * n`. Your task is to find the sum of minimum value in each row.\n",
"~~~\n",
"~~~if:cpp\n",
"Given a 2D vector of size `m * n`. Your task is to find the sum of minimum value in each row.\n",
"~~~\n",
"~~~[\n",
" [1, 2, 3, 4, 5, 6], # smallest number is 1\n",
" [4, 5, 6], # largest number is 1\n",
" [18, 17, 19, 20, 21, 22], # smallest number is 18\n",
"~~~if:python,ruby\n",
"Given a 2D list of size `m * n`. Your task is to find the sum of minimum value in each row.\n",
"~~~\n",
"\n",
"For Example:\n",
"```python\n",
"[\n",
" [1, 2, 3, 4, 5], # minimum value of row is 1\n",
" [5, 6, 7, 8, 9], # minimum value of row is 5\n",
" [20, 21, 34, 56, 100] # minimum value of row is 20\n",
"]\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"```\n",
"So, the function should return `26` because sum of minimums is as `1 + 5 + 20 = 26`\n",
"\n",
"~~~if:javascript,php\n",
"Note: You will be always given non-empty array containing Positive values.\n",
"~~~\n",
"~~~if:python\n",
"Note: You will be always given non-empty list containing Positive values.\n",
"~~~\n",
"~~~if:cpp\n",
"Note: You will be always given non-empty vector containing Positive values.\n",
"~~~\n",
"~~~if:c#\n",
"Note: You will be always given non-empty vector containing Positive values.\n",
"~~~\n",
"~~~if:cfml\n",
"Note: You will be always given non-empty array containing Positive values.\n",
"~~~\n",
"\n",
"ENJOY CODING :)\n",
"def sum_of_minimums(numbers):\n",
"\t\n",
"\n",
"Use Call-Based format\n",
"\n",
"ANSWER:\n",
"\n"
]
}
],
"source": [
"print(train_dataset[82239][\"question\"])"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "4decd052-f96b-4469-abfa-25beb9ab805c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id :\n",
"\n",
"78360\n",
"==========\n",
"question :\n",
"\n",
"\n",
"QUESTION:\n",
"# Personalized greeting\n",
"\n",
"Create a function that gives a personalized greeting. This function takes two parameters: `name` and `owner`.\n",
"\n",
"Use conditionals to return the proper message:\n",
"\n",
"case | return\n",
"--- | ---\n",
"name equals owner | 'Hello boss'\n",
"otherwise | 'Hello guest'\n",
"def greet(name, owner):\n",
"\t\n",
"\n",
"Use Call-Based format\n",
"\n",
"ANSWER:\n",
"\n",
"==========\n",
"answer :\n",
"\n",
"def greet(name, owner):\n",
"\tgreet = 'guest'\n",
"\tif name == owner: \n",
"\t\tgreet = 'boss'\n",
"\treturn 'Hello ' + greet\n",
"\n",
"==========\n"
]
}
],
"source": [
"import random\n",
"\n",
"id = random.randint(0, len(train_dataset)-1)\n",
"sample = train_dataset[id]\n",
"\n",
"for k, v in sample.items():\n",
" print(k, \":\\n\")\n",
" print(v)\n",
" print(\"=\"*10)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "d3be9853-0ce7-40a5-976e-6ff082a7ed2f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"115212"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(dataset)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "eea91645-fbbc-4495-a829-357cd2833a6b",
"metadata": {},
"outputs": [],
"source": [
"id = 115212-10"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "2b72f491-8a44-4206-ba06-bc1bc6ba011c",
"metadata": {},
"outputs": [],
"source": [
"prompt = dataset[id][\"question\"]"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "54768ef7-3752-45da-b5c8-721893c9b254",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"QUESTION:\n",
"Given an array of integers, return indices of the two numbers such that they add up to a specific target.\n",
"\n",
"You may assume that each input would have exactly one solution, and you may not use the same element twice.\n",
"\n",
"Example:\n",
"\n",
"\n",
"Given nums = [2, 7, 11, 15], target = 9,\n",
"\n",
"Because nums[0] + nums[1] = 2 + 7 = 9,\n",
"return [0, 1].\n",
"class Solution:\n",
" def twoSum(self, nums: List[int], target: int) -> List[int]:\n",
" \n",
"\n",
"Use Call-Based format\n",
"\n",
"ANSWER:\n",
"\n"
]
}
],
"source": [
"print(prompt)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "820f8729-a368-4b41-91ff-56601d21aaaf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"class Solution:\n",
"\tdef twoSum(self, nums, target):\n",
"\t \"\"\"\n",
"\t :type nums: List[int]\n",
"\t :type target: int\n",
"\t :rtype: List[int]\n",
"\t \"\"\"\n",
"\t a = {}\n",
"\t b = {}\n",
"\t for i in range(1, target+1):\n",
"\t\t a[i] += nums[i]\n",
"\t \n",
"\t a.clear()\n",
"\t for i in range(1, target+1):\n",
"\t\t a.get(i, 0)\n",
"\t return sorted(a)\n",
"\t\t\t\n",
"\t\t\n",
"\tdef helper(n):\n",
"\t s = 1\n",
"\t q ='sum'\n",
"\t while n!= s:\n",
"\t\t n += 1\n",
"\t\t q +='sum'\n",
"\t\t s += q\n",
"\t\t n += 1\n",
"\t return q\n"
]
}
],
"source": [
"input_ids = tokenizer(prompt, return_tensors=\"pt\").input_ids\n",
"start = len(input_ids[0])\n",
"output = model.generate(\n",
" input_ids,\n",
" max_length=1000,\n",
" max_length=start+400,\n",
" do_sample=True,\n",
" top_p=0.95,\n",
" pad_token_id=tokenizer.pad_token_id,\n",
" early_stopping=True,\n",
" temperature=1.,\n",
" no_repeat_ngram_size=None,\n",
" repetition_penalty=None,\n",
" num_return_sequences=None,\n",
")\n",
"\n",
"print(tokenizer.decode(output[0][start:]))"
"print(tokenizer.decode(output[0][start:]).strip())"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "e8d5ad26-1cd6-49bb-8f4c-550091bfcc9d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"import numpy as np\n",
"\n",
"H, N = list(map(int, input().split()))\n",
"\n",
"ab = []\n",
"for i in range(N):\n",
"\ta, b = list(map(int, input().split()))\n",
"\tab.append([a, b])\n",
"\n",
"ab = np.array(ab)\n",
"a_list = ab[:, 0]\n",
"b_list = ab[:, 1]\n",
"max_a = ab.max()\n",
"\n",
"inf = float('inf')\n",
"dp = np.array([inf for _ in range(H + max_a)])\n",
"dp[0] = 0\n",
"\n",
"for i in range(1, len(dp)):\n",
"\tdp[i] = np.amin(dp[np.maximum(i - a_list, 0)] + b_list)\n",
"\n",
"print((int(min(dp[H:]))))\n",
"\n"
]
}
],
"source": [
"print(dataset[id][\"answer\"])"
]
},
{

File diff suppressed because it is too large