gpt-code-clippy/generate_apps.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "7b529017-a33e-4b2d-be5b-387ae7dd51e1",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2021-07-18 19:07:02.959520: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory\n",
      "2021-07-18 19:07:02.959564: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\n"
     ]
    }
   ],
   "source": [
    "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
    "from datasets import load_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "53610604-dff1-4a65-83b5-0795d5b3b473",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using custom data configuration formatted-d9019fd8ed858445\n",
      "Reusing dataset apps (/home/shared/.cache/hf/datasets/apps/formatted-d9019fd8ed858445/0.1.0/5987476458cc986e36654364319c6fe798b880d64a35518cbc00dc04f3c41e4d)\n"
     ]
    }
   ],
   "source": [
    "# Test split of the \"formatted\" configuration, built by a local copy of the APPS loading script.\n",
    "dataset = load_dataset(\n",
    "    path=\"/home/arto/datasets/datasets/apps/apps.py\",\n",
    "    name=\"formatted\",\n",
    "    split=\"test\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "3811070c-c6a0-4a84-9362-cf7de0d5bd75",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/arto/transformers/src/transformers/modeling_flax_pytorch_utils.py:201: UserWarning: The given NumPy array is not writeable, and PyTorch does not support non-writeable tensors. This means you can write to the underlying (supposedly non-writeable) NumPy array using the tensor. You may want to copy the array to protect its data or make it writeable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at  /pytorch/torch/csrc/utils/tensor_numpy.cpp:180.)\n",
      "  pt_model_dict[flax_key] = torch.from_numpy(flax_tensor)\n",
      "All Flax model weights were used when initializing GPTNeoForCausalLM.\n",
      "\n",
      "Some weights of GPTNeoForCausalLM were not initialized from the Flax model and are newly initialized: ['transformer.h.0.attn.attention.bias', 'transformer.h.6.attn.attention.masked_bias', 'transformer.h.3.attn.attention.masked_bias', 'transformer.h.2.attn.attention.bias', 'transformer.h.10.attn.attention.masked_bias', 'transformer.h.9.attn.attention.masked_bias', 'transformer.h.1.attn.attention.masked_bias', 'transformer.h.7.attn.attention.masked_bias', 'transformer.h.2.attn.attention.masked_bias', 'lm_head.weight', 'transformer.h.10.attn.attention.bias', 'transformer.h.11.attn.attention.masked_bias', 'transformer.h.4.attn.attention.masked_bias', 'transformer.h.8.attn.attention.masked_bias', 'transformer.h.0.attn.attention.masked_bias', 'transformer.h.5.attn.attention.masked_bias', 'transformer.h.4.attn.attention.bias', 'transformer.h.6.attn.attention.bias', 'transformer.h.8.attn.attention.bias']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    }
   ],
   "source": [
    "# Load the local checkpoint; from_flax=True makes transformers convert the Flax weights\n",
    "# into the PyTorch model class.\n",
    "model = AutoModelForCausalLM.from_pretrained(\n",
    "    \"/home/shared/models/gpt-code-clippy-125M-apps-lr-5e-5/ckpt-8169/\",\n",
    "    from_flax=True,\n",
    ")\n",
    "# Disabled alternative: the stock GPT-Neo 125M weights from the hub.\n",
    "# model = AutoModelForCausalLM.from_pretrained(\"EleutherAI/gpt-neo-125M\")\n",
    "\n",
    "# The tokenizer comes from the GPT-Neo 125M hub model; its EOS token doubles as the pad token.\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"EleutherAI/gpt-neo-125M\")\n",
    "tokenizer.pad_token = tokenizer.eos_token"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "830a374f-a70d-466d-a766-3671e5c11765",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "id :\n",
      "\n",
      "33905\n",
      "==========\n",
      "question :\n",
      "\n",
      "\n",
      "QUESTION:\n",
      "There are N empty boxes arranged in a row from left to right.\n",
      "The integer i is written on the i-th box from the left (1 \\leq i \\leq N).\n",
      "For each of these boxes, Snuke can choose either to put a ball in it or to put nothing in it.\n",
      "We say a set of choices to put a ball or not in the boxes is good when the following condition is satisfied:\n",
      " - For every integer i between 1 and N (inclusive), the total number of balls contained in the boxes with multiples of i written on them is congruent to a_i modulo 2.\n",
      "Does there exist a good set of choices? If the answer is yes, find one good set of choices.\n",
      "\n",
      "-----Constraints-----\n",
      " - All values in input are integers.\n",
      " - 1 \\leq N \\leq 2 \\times 10^5\n",
      " - a_i is 0 or 1.\n",
      "\n",
      "-----Input-----\n",
      "Input is given from Standard Input in the following format:\n",
      "N\n",
      "a_1 a_2 ... a_N\n",
      "\n",
      "-----Output-----\n",
      "If a good set of choices does not exist, print -1.\n",
      "If a good set of choices exists, print one such set of choices in the following format:\n",
      "M\n",
      "b_1 b_2 ... b_M\n",
      "\n",
      "where M denotes the number of boxes that will contain a ball, and b_1,\\ b_2,\\ ...,\\ b_M are the integers written on these boxes, in any order.\n",
      "\n",
      "-----Sample Input-----\n",
      "3\n",
      "1 0 0\n",
      "\n",
      "-----Sample Output-----\n",
      "1\n",
      "1\n",
      "\n",
      "Consider putting a ball only in the box with 1 written on it.\n",
      " - There are three boxes with multiples of 1 written on them: the boxes with 1, 2, and 3. The total number of balls contained in these boxes is 1.\n",
      " - There is only one box with a multiple of 2 written on it: the box with 2. The total number of balls contained in these boxes is 0.\n",
      " - There is only one box with a multiple of 3 written on it: the box with 3. The total number of balls contained in these boxes is 0.\n",
      "Thus, the condition is satisfied, so this set of choices is good.\n",
      "\n",
      "\n",
      "Use Standard Input format\n",
      "\n",
      "ANSWER:\n",
      "\n",
      "==========\n",
      "answer :\n",
      "\n",
      "N = int(input())\n",
      "A = list(map(int,input().split()))\n",
      "B = (N+1)*[0]\n",
      "\n",
      "for n in range(N,0,-1):\n",
      "\tB[n] = (sum(B[n::n])+A[n-1])%2\n",
      "\n",
      "print(B.count(1))\n",
      "for n in range(N+1):\n",
      "\tif B[n]:\n",
      "\t\tprint(n,end=\" \")\n",
      "\n",
      "==========\n"
     ]
    }
   ],
   "source": [
    "import random\n",
    "\n",
    "# Draw one test example uniformly at random and print each of its fields.\n",
    "# NOTE: `id` shadows the Python builtin; the name is kept because other cells read `id`.\n",
    "id = random.randrange(len(dataset))\n",
    "sample = dataset[id]\n",
    "\n",
    "separator = \"=\" * 10\n",
    "for field, value in sample.items():\n",
    "    print(field, \":\\n\")\n",
    "    print(value)\n",
    "    print(separator)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "1b109cec-c721-410f-b3df-38a50ae8607b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# prompt = dataset[82239][\"question\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "8fb6f26e-bc50-40e6-b373-3d9e50993b22",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using custom data configuration formatted-d9019fd8ed858445\n",
      "Reusing dataset apps (/home/shared/.cache/hf/datasets/apps/formatted-d9019fd8ed858445/0.1.0/5987476458cc986e36654364319c6fe798b880d64a35518cbc00dc04f3c41e4d)\n"
     ]
    }
   ],
   "source": [
    "# Train split of the \"formatted\" configuration, built by the same local APPS loading script.\n",
    "train_dataset = load_dataset(\n",
    "    path=\"/home/arto/datasets/datasets/apps/apps.py\",\n",
    "    name=\"formatted\",\n",
    "    split=\"train\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "0eb58b5c-4a3b-48cb-82e1-e68b792f806e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "QUESTION:\n",
      "~~~if:csharp,javascript,cfml,php\n",
      "Given a 2D array of size `m * n`. Your task is to find the sum of minimum value in each row.\n",
      "~~~\n",
      "~~~if:cpp\n",
      "Given a 2D vector of size `m * n`. Your task is to find the sum of minimum value in each row.\n",
      "~~~\n",
      "~~~if:python,ruby\n",
      "Given a 2D list of size `m * n`. Your task is to find the sum of minimum value in each row.\n",
      "~~~\n",
      "\n",
      "For Example:\n",
      "```python\n",
      "[\n",
      "  [1, 2, 3, 4, 5],       # minimum value of row is 1\n",
      "  [5, 6, 7, 8, 9],       # minimum value of row is 5\n",
      "  [20, 21, 34, 56, 100]  # minimum value of row is 20\n",
      "]\n",
      "```\n",
      "So, the function should return `26` because sum of minimums is as `1 + 5 + 20 = 26`\n",
      "\n",
      "~~~if:javascript,php\n",
      "Note: You will be always given non-empty array containing Positive values.\n",
      "~~~\n",
      "~~~if:python\n",
      "Note: You will be always given non-empty list containing Positive values.\n",
      "~~~\n",
      "~~~if:cpp\n",
      "Note: You will be always given non-empty vector containing Positive values.\n",
      "~~~\n",
      "~~~if:c#\n",
      "Note: You will be always given non-empty vector containing Positive values.\n",
      "~~~\n",
      "~~~if:cfml\n",
      "Note: You will be always given non-empty array containing Positive values.\n",
      "~~~\n",
      "\n",
      "ENJOY CODING :)\n",
      "def sum_of_minimums(numbers):\n",
      "\t\n",
      "\n",
      "Use Call-Based format\n",
      "\n",
      "ANSWER:\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Show the prompt text of one fixed training example (index 82239).\n",
    "pinned_example = train_dataset[82239]\n",
    "print(pinned_example[\"question\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "4decd052-f96b-4469-abfa-25beb9ab805c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "id :\n",
      "\n",
      "78360\n",
      "==========\n",
      "question :\n",
      "\n",
      "\n",
      "QUESTION:\n",
      "# Personalized greeting\n",
      "\n",
      "Create a function that gives a personalized greeting. This function takes two parameters: `name` and `owner`.\n",
      "\n",
      "Use conditionals to return the proper message:\n",
      "\n",
      "case | return\n",
      "--- | ---\n",
      "name equals owner | 'Hello boss'\n",
      "otherwise         | 'Hello guest'\n",
      "def greet(name, owner):\n",
      "\t\n",
      "\n",
      "Use Call-Based format\n",
      "\n",
      "ANSWER:\n",
      "\n",
      "==========\n",
      "answer :\n",
      "\n",
      "def greet(name, owner):\n",
      "\tgreet = 'guest'\n",
      "\tif name == owner: \n",
      "\t\tgreet = 'boss'\n",
      "\treturn 'Hello ' + greet\n",
      "\n",
      "==========\n"
     ]
    }
   ],
   "source": [
    "import random\n",
    "\n",
    "# Draw one training example uniformly at random and print each of its fields.\n",
    "# NOTE: `id` shadows the Python builtin; the name is kept because other cells read `id`.\n",
    "id = random.randrange(len(train_dataset))\n",
    "sample = train_dataset[id]\n",
    "\n",
    "separator = \"=\" * 10\n",
    "for field, value in sample.items():\n",
    "    print(field, \":\\n\")\n",
    "    print(value)\n",
    "    print(separator)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "d3be9853-0ce7-40a5-976e-6ff082a7ed2f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "115212"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of examples in the test split.\n",
    "dataset.num_rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eea91645-fbbc-4495-a829-357cd2833a6b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select the example ten positions before the end of the test split.\n",
    "# The index is derived from the dataset length (115212 when this notebook was last run)\n",
    "# instead of hard-coding it, so the cell keeps working if the split size changes.\n",
    "# NOTE: `id` shadows the Python builtin; the name is kept because other cells read `id`.\n",
    "id = len(dataset) - 10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "2b72f491-8a44-4206-ba06-bc1bc6ba011c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prompt text of the test example selected by `id`.\n",
    "selected_example = dataset[id]\n",
    "prompt = selected_example[\"question\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "54768ef7-3752-45da-b5c8-721893c9b254",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "QUESTION:\n",
      "Given an array of integers, return indices of the two numbers such that they add up to a specific target.\n",
      "\n",
      "You may assume that each input would have exactly one solution, and you may not use the same element twice.\n",
      "\n",
      "Example:\n",
      "\n",
      "\n",
      "Given nums = [2, 7, 11, 15], target = 9,\n",
      "\n",
      "Because nums[0] + nums[1] = 2 + 7 = 9,\n",
      "return [0, 1].\n",
      "class Solution:\n",
      "    def twoSum(self, nums: List[int], target: int) -> List[int]:\n",
      "        \n",
      "\n",
      "Use Call-Based format\n",
      "\n",
      "ANSWER:\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# print() renders the prompt's newlines; a bare expression would display the escaped repr.\n",
    "print(prompt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "820f8729-a368-4b41-91ff-56601d21aaaf",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "class Solution:\n",
      "\tdef twoSum(self, nums, target):\n",
      "\t    \"\"\"\n",
      "\t    :type nums: List[int]\n",
      "\t    :type target: int\n",
      "\t    :rtype: List[int]\n",
      "\t    \"\"\"\n",
      "\t    a = {}\n",
      "\t    b = {}\n",
      "\t    for i in range(1, target+1):\n",
      "\t\t   a[i] += nums[i]\n",
      "\t         \n",
      "\t    a.clear()\n",
      "\t    for i in range(1, target+1):\n",
      "\t\t   a.get(i, 0)\n",
      "\t    return sorted(a)\n",
      "\t\t\t\n",
      "\t\t\n",
      "\tdef helper(n):\n",
      "\t    s = 1\n",
      "\t    q ='sum'\n",
      "\t    while n!= s:\n",
      "\t\t   n += 1\n",
      "\t\t   q +='sum'\n",
      "\t\t   s += q\n",
      "\t\t   n += 1\n",
      "\t    return q\n"
     ]
    }
   ],
   "source": [
    "# Tokenize the prompt and keep the attention mask: the pad token was set to the EOS token\n",
    "# earlier in this notebook, so generate() cannot infer the mask from the ids alone.\n",
    "encoded = tokenizer(prompt, return_tensors=\"pt\")\n",
    "input_ids = encoded.input_ids\n",
    "start = input_ids.shape[1]  # prompt length in tokens; generated tokens start at this offset\n",
    "\n",
    "# Sample up to 400 tokens beyond the prompt with nucleus (top-p) sampling.\n",
    "# Removed from the earlier call: early_stopping (a beam-search option, unused when sampling\n",
    "# with a single beam) and the explicit None arguments, which only restated the defaults.\n",
    "output = model.generate(\n",
    "    input_ids,\n",
    "    attention_mask=encoded.attention_mask,\n",
    "    max_length=start + 400,\n",
    "    do_sample=True,\n",
    "    top_p=0.95,\n",
    "    temperature=1.0,\n",
    "    pad_token_id=tokenizer.pad_token_id,\n",
    ")\n",
    "\n",
    "# Decode only the continuation and drop special tokens so the end-of-text marker\n",
    "# does not appear in the printed code.\n",
    "print(tokenizer.decode(output[0][start:], skip_special_tokens=True).strip())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "e8d5ad26-1cd6-49bb-8f4c-550091bfcc9d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "import numpy as np\n",
      "\n",
      "H, N = list(map(int, input().split()))\n",
      "\n",
      "ab = []\n",
      "for i in range(N):\n",
      "\ta, b = list(map(int, input().split()))\n",
      "\tab.append([a, b])\n",
      "\n",
      "ab = np.array(ab)\n",
      "a_list = ab[:, 0]\n",
      "b_list = ab[:, 1]\n",
      "max_a = ab.max()\n",
      "\n",
      "inf = float('inf')\n",
      "dp = np.array([inf for _ in range(H + max_a)])\n",
      "dp[0] = 0\n",
      "\n",
      "for i in range(1, len(dp)):\n",
      "\tdp[i] = np.amin(dp[np.maximum(i - a_list, 0)] + b_list)\n",
      "\n",
      "print((int(min(dp[H:]))))\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Answer stored in the dataset for the test example selected by `id`.\n",
    "reference_answer = dataset[id][\"answer\"]\n",
    "print(reference_answer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ee80ebdb-163c-43b8-b1cb-0585d013bba1",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}