Merge branch 'main' of github.com:ncoop57/gpt-code-clippy into main

This commit is contained in:
ncoop57 2021-07-20 20:24:33 +00:00
commit 4d091e2b02
5 changed files with 2080 additions and 254 deletions

code-clippy-app.ipynb Normal file (256 additions)

@ -0,0 +1,256 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "333a4f54-e120-4969-8adf-32b98655ff41",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2021-07-18 23:45:31.083087: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory\n",
"2021-07-18 23:45:31.083131: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\n"
]
}
],
"source": [
"import gradio as gr\n",
"\n",
"from transformers import AutoModelForCausalLM, AutoTokenizer"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "d875c6bc-8d97-4e03-9ee4-a4bd47f191bf",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/arto/transformers/src/transformers/modeling_flax_pytorch_utils.py:201: UserWarning: The given NumPy array is not writeable, and PyTorch does not support non-writeable tensors. This means you can write to the underlying (supposedly non-writeable) NumPy array using the tensor. You may want to copy the array to protect its data or make it writeable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /pytorch/torch/csrc/utils/tensor_numpy.cpp:180.)\n",
" pt_model_dict[flax_key] = torch.from_numpy(flax_tensor)\n",
"All Flax model weights were used when initializing GPTNeoForCausalLM.\n",
"\n",
"Some weights of GPTNeoForCausalLM were not initialized from the Flax model and are newly initialized: ['transformer.h.4.attn.attention.bias', 'transformer.h.8.attn.attention.masked_bias', 'transformer.h.0.attn.attention.bias', 'transformer.h.2.attn.attention.masked_bias', 'transformer.h.2.attn.attention.bias', 'transformer.h.8.attn.attention.bias', 'transformer.h.10.attn.attention.masked_bias', 'transformer.h.11.attn.attention.masked_bias', 'transformer.h.3.attn.attention.masked_bias', 'transformer.h.9.attn.attention.masked_bias', 'transformer.h.6.attn.attention.masked_bias', 'transformer.h.7.attn.attention.masked_bias', 'transformer.h.6.attn.attention.bias', 'transformer.h.0.attn.attention.masked_bias', 'transformer.h.1.attn.attention.masked_bias', 'transformer.h.5.attn.attention.masked_bias', 'transformer.h.4.attn.attention.masked_bias', 'transformer.h.10.attn.attention.bias', 'lm_head.weight']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
]
}
],
"source": [
"model_name = \"/home/shared/models/gpt-code-clippy-125M-apps-lr-adam1e-4-bs128/ckpt-1633/\"\n",
"model = AutoModelForCausalLM.from_pretrained(model_name, from_flax=True)\n",
"tokenizer = AutoTokenizer.from_pretrained(\"EleutherAI/gpt-neo-125M\")\n",
"tokenizer.pad_token = tokenizer.eos_token"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "e6ebb017-e784-4311-a795-4bacd2263d19",
"metadata": {},
"outputs": [],
"source": [
"def format_input(question, starter_code=\"\"):\n",
" answer_type = \"\\nUse Call-Based format\\n\" if starter_code else \"\\nUse Standard Input format\\n\"\n",
" return f\"\\nQUESTION:\\n{question}\\n{starter_code}\\n{answer_type}\\nANSWER:\\n\""
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "b00ad3ab-3086-48f0-940a-3129a7dff30a",
"metadata": {},
"outputs": [],
"source": [
"def format_outputs(text):\n",
" formatted_text =f'''\n",
" <head>\n",
" <link rel=\"stylesheet\"\n",
" href=\"https://cdnjs.cloudflare.com/ajax/libs/highlight.js/10.0.3/styles/default.min.css\">\n",
" <script src=\"https://cdnjs.cloudflare.com/ajax/libs/highlight.js/10.0.3/highlight.min.js\"></script>\n",
" <script>hljs.initHighlightingOnLoad();</script>\n",
" </head>\n",
" <body>\n",
" <pre><code class=\"python\">{text}</code></pre>\n",
" </body>\n",
" '''\n",
" return formatted_text"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "28abdab5-962e-47af-83b2-4f9c491ba705",
"metadata": {},
"outputs": [],
"source": [
"def generate_solution(question, starter_code=\"\", temperature=1., num_beams=1):\n",
" prompt = format_input(question, starter_code)\n",
" input_ids = tokenizer(prompt, return_tensors=\"pt\").input_ids\n",
" start = len(input_ids[0])\n",
" output = model.generate(\n",
" input_ids,\n",
" max_length=start+200,\n",
" do_sample=True,\n",
" top_p=0.95,\n",
" pad_token_id=tokenizer.pad_token_id,\n",
" early_stopping=True,\n",
" temperature=1.,\n",
" num_beams=int(num_beams),\n",
" no_repeat_ngram_size=None,\n",
" repetition_penalty=None,\n",
" num_return_sequences=None,\n",
" )\n",
" \n",
" return format_outputs(tokenizer.decode(output[0][start:]).strip())"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "a156ff76-7e50-40c9-8b3e-e9ddeb450501",
"metadata": {},
"outputs": [],
"source": [
"_EXAMPLES = [\n",
" [\n",
" \"\"\"\n",
"Given a 2D list of size `m * n`. Your task is to find the sum of minimum value in each row.\n",
"For Example:\n",
"```python\n",
"[\n",
" [1, 2, 3, 4, 5], # minimum value of row is 1\n",
" [5, 6, 7, 8, 9], # minimum value of row is 5\n",
" [20, 21, 34, 56, 100] # minimum value of row is 20\n",
"]\n",
"```\n",
"So, the function should return `26` because sum of minimums is as `1 + 5 + 20 = 26`\n",
" \"\"\",\n",
" \"\",\n",
" 0.8,\n",
" ],\n",
" [\n",
" \"\"\"\n",
"# Personalized greeting\n",
"\n",
"Create a function that gives a personalized greeting. This function takes two parameters: `name` and `owner`.\n",
" \"\"\",\n",
" \"\"\"\n",
"Use conditionals to return the proper message:\n",
"\n",
"case| return\n",
"--- | ---\n",
"name equals owner | 'Hello boss'\n",
"otherwise | 'Hello guest'\n",
"def greet(name, owner):\n",
" \"\"\",\n",
" 0.8,\n",
" ]\n",
"] "
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "fb3d90fc-6932-4343-9c86-70ae94ca95aa",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Running locally at: http://127.0.0.1:7861/\n",
"This share link will expire in 24 hours. If you need a permanent link, visit: https://gradio.app/introducing-hosted (NEW!)\n",
"Running on External URL: https://34711.gradio.app\n",
"Interface loading below...\n"
]
},
{
"data": {
"text/html": [
"\n",
" <iframe\n",
" width=\"900\"\n",
" height=\"500\"\n",
" src=\"https://34711.gradio.app\"\n",
" frameborder=\"0\"\n",
" allowfullscreen\n",
" ></iframe>\n",
" "
],
"text/plain": [
"<IPython.lib.display.IFrame at 0x7f0bf02db670>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(<Flask 'gradio.networking'>,\n",
" 'http://127.0.0.1:7861/',\n",
" 'https://34711.gradio.app')"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"inputs = [\n",
" gr.inputs.Textbox(placeholder=\"Define a problem here...\", lines=7),\n",
" gr.inputs.Textbox(placeholder=\"Provide optional starter code...\", lines=3),\n",
" gr.inputs.Slider(0.5, 1.5, 0.1, default=0.8, label=\"Temperature\"),\n",
" gr.inputs.Slider(1,4,1,default=1, label=\"Beam size\")\n",
"]\n",
"\n",
"outputs = [\n",
" gr.outputs.HTML(label=\"Solution\")\n",
"]\n",
"\n",
"gr.Interface(\n",
" generate_solution, \n",
" inputs=inputs, \n",
" outputs=outputs,\n",
" title=\"Code Clippy: Problem Solver\",\n",
" examples=_EXAMPLES,\n",
").launch(share=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "737cd94c-5286-4832-9611-06e6f2a89357",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
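
A quick way to exercise the notebook's pipeline (format_input -> model.generate -> decode) outside the Gradio UI is sketched below. This is a minimal sketch, not the notebook's own code: it assumes the public flax-community/gpt-code-clippy-125M-apps-alldata checkpoint in place of the machine-local ckpt path loaded above, and the question text is purely illustrative.

from transformers import AutoModelForCausalLM, AutoTokenizer

# Assumption: the public checkpoint stands in for the local
# /home/shared/models/... path used in the notebook above.
model_name = "flax-community/gpt-code-clippy-125M-apps-alldata"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Illustrative problem statement in the APPS "Standard Input" style.
question = "Read an integer N from standard input and print the sum 1 + 2 + ... + N."
prompt = f"\nQUESTION:\n{question}\n\nUse Standard Input format\n\nANSWER:\n"

input_ids = tokenizer(prompt, return_tensors="pt").input_ids
start = input_ids.shape[1]
output = model.generate(
    input_ids,
    max_length=start + 200,   # cap at 200 newly generated tokens
    do_sample=True,
    top_p=0.95,
    temperature=0.8,
    pad_token_id=tokenizer.pad_token_id,
)
# Drop the prompt tokens so only the generated solution is printed.
print(tokenizer.decode(output[0][start:], skip_special_tokens=True).strip())

Slicing the output at `start` mirrors what generate_solution does, so only the newly generated tokens are decoded rather than the prompt.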

demo_backup/app.py Normal file (145 additions)

@ -0,0 +1,145 @@
import gradio as gr
from rich.console import Console
from rich.syntax import Syntax
from transformers import AutoModelForCausalLM, AutoTokenizer
import requests
import json
import webbrowser

# model_name = "flax-community/gpt-code-clippy-1.3B-apps-alldata"
model_name = "flax-community/gpt-code-clippy-125M-apps-alldata"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Recording console so rich output can be exported as HTML for the Gradio widget.
console = Console(record=True)


def format_input(question, starter_code=""):
    answer_type = (
        "\nUse Call-Based format\n" if starter_code else "\nUse Standard Input format\n"
    )
    return f"\nQUESTION:\n{question}\n{starter_code}\n{answer_type}\nANSWER:\n"


def format_outputs(text):
    formatted_text = Syntax(
        text, "python", line_numbers=True, indent_guides=True, word_wrap=True
    )
    console.print(formatted_text)
    return console.export_html(inline_styles=True)


def generate_solution(question, starter_code="", temperature=1.0, num_beams=1):
    prompt = format_input(question, starter_code)
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    start = len(input_ids[0])
    output = model.generate(
        input_ids,
        max_length=start + 200,
        do_sample=True,
        top_p=0.95,
        pad_token_id=tokenizer.pad_token_id,
        early_stopping=True,
        temperature=temperature,
        num_beams=int(num_beams),
        no_repeat_ngram_size=None,
        repetition_penalty=None,
        num_return_sequences=None,
    )
    return format_outputs(
        tokenizer.decode(output[0][start:], skip_special_tokens=True).strip()
    )


_EXAMPLES = [
    [
        """
Given a 2D list of size `m * n`. Your task is to find the sum of minimum value in each row.
For Example:
```python
[
  [1, 2, 3, 4, 5],       # minimum value of row is 1
  [5, 6, 7, 8, 9],       # minimum value of row is 5
  [20, 21, 34, 56, 100]  # minimum value of row is 20
]
```
So, the function should return `26` because sum of minimums is as `1 + 5 + 20 = 26`
        """,
        "",
        0.8,
    ],
    [
        """
# Personalized greeting

Create a function that gives a personalized greeting. This function takes two parameters: `name` and `owner`.
        """,
        """
Use conditionals to return the proper message:

case| return
--- | ---
name equals owner | 'Hello boss'
otherwise | 'Hello guest'
def greet(name, owner):
        """,
        0.8,
    ],
]

inputs = [
    gr.inputs.Textbox(placeholder="Define a problem here...", lines=7),
    gr.inputs.Textbox(placeholder="Provide optional starter code...", lines=3),
    gr.inputs.Slider(0.5, 1.5, 0.1, default=0.8, label="Temperature"),
    gr.inputs.Slider(1, 4, 1, default=1, label="Beam size"),
    gr.inputs.Textbox(lines=1, label="Your GitHub API token"),
]

outputs = [gr.outputs.HTML(label="Solution")]
print(outputs)

# Experimental "carbon" sharing: post the generated solution as a GitHub gist,
# then open it in carbon.now.sh to get a shareable image.
GITHUB_API = "https://api.github.com"


def create_carbon_copy(solution_text, api_token):
    # form a request URL for the gists endpoint
    url = GITHUB_API + "/gists"
    headers = {"Authorization": "token %s" % api_token}
    params = {"scope": "gist"}
    # the gists API expects a mapping of file names to contents (file name is illustrative)
    payload = {"public": True, "files": {"solution.py": {"content": solution_text}}}
    res = requests.post(url, headers=headers, params=params, data=json.dumps(payload))
    # meant to be triggered by a "Create a 'carbon' copy" button in the UI
    carbon_url = "https://carbon.now.sh/" + res.json()["id"]
    webbrowser.open_new(carbon_url)


gr.Interface(
    generate_solution,
    inputs=inputs,
    outputs=outputs,
    title="Code Clippy: Problem Solver",
    examples=_EXAMPLES,
).launch(share=False)
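
Unlike the notebook, which wraps the decoded text in a hand-written highlight.js page, this demo renders it with rich and exports the recorded console as HTML. A minimal standalone sketch of that path, with an illustrative snippet standing in for a generated solution:

from rich.console import Console
from rich.syntax import Syntax

console = Console(record=True)

# Illustrative snippet standing in for a generated solution.
snippet = "def greet(name, owner):\n    return 'Hello boss' if name == owner else 'Hello guest'"

# Same rendering options as format_outputs() above.
console.print(Syntax(snippet, "python", line_numbers=True, indent_guides=True, word_wrap=True))
html = console.export_html(inline_styles=True)  # HTML with styles inlined
print(html[:200])

Because export_html(inline_styles=True) inlines the styling, the returned string can be handed to the gr.outputs.HTML component as-is.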


@ -1,6 +1,6 @@
#! /bin/bash
./run_clm_apps.py \
--output_dir /home/shared/models/gpt-code-clippy-1.3B-apps \
--output_dir /home/shared/models/gpt-code-clippy-1.3B-apps-3 \
--model_name_or_path EleutherAI/gpt-neo-1.3B \
--dataset_name ./apps.py \
--dataset_config_name formatted \
@ -24,11 +24,13 @@
--skip_memory_metrics="False" \
--save_steps="1000" \
--save_strategy epoch \
--save_total_limit 2 \
--save_total_limit="None" \
--gradient_accumulation_steps 1 \
--adafactor true \
--all_data true \
--all_data false \
--seed 842 \
--save_optimizer false \
--max_eval_samples 20000
# --resume_from_checkpoint $HOME/gpt-neo-125M-code-clippy/ckpt_201 \
# --max_train_samples="10000" \
# --max_eval_samples="1000"


@ -10,8 +10,8 @@
"name": "stderr",
"output_type": "stream",
"text": [
"2021-07-18 07:38:07.042553: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory\n",
"2021-07-18 07:38:07.042596: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\n"
"2021-07-18 19:07:02.959520: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory\n",
"2021-07-18 19:07:02.959564: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\n"
]
}
],
@ -36,24 +36,38 @@
}
],
"source": [
"dataset = load_dataset(\"/home/arto/datasets/datasets/apps/apps.py\", \"formatted\", split=\"train\")"
"dataset = load_dataset(\"/home/arto/datasets/datasets/apps/apps.py\", \"formatted\", split=\"test\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 3,
"id": "3811070c-c6a0-4a84-9362-cf7de0d5bd75",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/arto/transformers/src/transformers/modeling_flax_pytorch_utils.py:201: UserWarning: The given NumPy array is not writeable, and PyTorch does not support non-writeable tensors. This means you can write to the underlying (supposedly non-writeable) NumPy array using the tensor. You may want to copy the array to protect its data or make it writeable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /pytorch/torch/csrc/utils/tensor_numpy.cpp:180.)\n",
" pt_model_dict[flax_key] = torch.from_numpy(flax_tensor)\n",
"All Flax model weights were used when initializing GPTNeoForCausalLM.\n",
"\n",
"Some weights of GPTNeoForCausalLM were not initialized from the Flax model and are newly initialized: ['transformer.h.0.attn.attention.bias', 'transformer.h.6.attn.attention.masked_bias', 'transformer.h.3.attn.attention.masked_bias', 'transformer.h.2.attn.attention.bias', 'transformer.h.10.attn.attention.masked_bias', 'transformer.h.9.attn.attention.masked_bias', 'transformer.h.1.attn.attention.masked_bias', 'transformer.h.7.attn.attention.masked_bias', 'transformer.h.2.attn.attention.masked_bias', 'lm_head.weight', 'transformer.h.10.attn.attention.bias', 'transformer.h.11.attn.attention.masked_bias', 'transformer.h.4.attn.attention.masked_bias', 'transformer.h.8.attn.attention.masked_bias', 'transformer.h.0.attn.attention.masked_bias', 'transformer.h.5.attn.attention.masked_bias', 'transformer.h.4.attn.attention.bias', 'transformer.h.6.attn.attention.bias', 'transformer.h.8.attn.attention.bias']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
]
}
],
"source": [
"# model = AutoModelForCausalLM.from_pretrained(\"/home/arto/gpt-code-clippy-lr1e-4-bs1024-f/ckpt-80000\",from_flax=True)\n",
"model = AutoModelForCausalLM.from_pretrained(\"/home/shared/models/gpt-code-clippy-125M-apps-lr-5e-5/ckpt-8169/\",from_flax=True)\n",
"# model = AutoModelForCausalLM.from_pretrained(\"EleutherAI/gpt-neo-125M\")\n",
"tokenizer = AutoTokenizer.from_pretrained(\"EleutherAI/gpt-neo-125M\")\n",
"tokenizer.pad_token = tokenizer.eos_token"
]
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 36,
"id": "830a374f-a70d-466d-a766-3671e5c11765",
"metadata": {},
"outputs": [
@ -63,85 +77,70 @@
"text": [
"id :\n",
"\n",
"6386\n",
"33905\n",
"==========\n",
"question :\n",
"\n",
"\n",
"QUESTION:\n",
"Given an array arr.  You can choose a set of integers and remove all the occurrences of these integers in the array.\n",
"Return the minimum size of the set so that at least half of the integers of the array are removed.\n",
" \n",
"Example 1:\n",
"Input: arr = [3,3,3,3,5,5,5,2,2,7]\n",
"Output: 2\n",
"Explanation: Choosing {3,7} will make the new array [5,5,5,2,2] which has size 5 (i.e equal to half of the size of the old array).\n",
"Possible sets of size 2 are {3,5},{3,2},{5,2}.\n",
"Choosing set {2,7} is not possible as it will make the new array [3,3,3,3,5,5,5] which has size greater than half of the size of the old array.\n",
"There are N empty boxes arranged in a row from left to right.\n",
"The integer i is written on the i-th box from the left (1 \\leq i \\leq N).\n",
"For each of these boxes, Snuke can choose either to put a ball in it or to put nothing in it.\n",
"We say a set of choices to put a ball or not in the boxes is good when the following condition is satisfied:\n",
" - For every integer i between 1 and N (inclusive), the total number of balls contained in the boxes with multiples of i written on them is congruent to a_i modulo 2.\n",
"Does there exist a good set of choices? If the answer is yes, find one good set of choices.\n",
"\n",
"Example 2:\n",
"Input: arr = [7,7,7,7,7,7]\n",
"Output: 1\n",
"Explanation: The only possible set you can choose is {7}. This will make the new array empty.\n",
"-----Constraints-----\n",
" - All values in input are integers.\n",
" - 1 \\leq N \\leq 2 \\times 10^5\n",
" - a_i is 0 or 1.\n",
"\n",
"Example 3:\n",
"Input: arr = [1,9]\n",
"Output: 1\n",
"-----Input-----\n",
"Input is given from Standard Input in the following format:\n",
"N\n",
"a_1 a_2 ... a_N\n",
"\n",
"Example 4:\n",
"Input: arr = [1000,1000,3,7]\n",
"Output: 1\n",
"-----Output-----\n",
"If a good set of choices does not exist, print -1.\n",
"If a good set of choices exists, print one such set of choices in the following format:\n",
"M\n",
"b_1 b_2 ... b_M\n",
"\n",
"Example 5:\n",
"Input: arr = [1,2,3,4,5,6,7,8,9,10]\n",
"Output: 5\n",
"where M denotes the number of boxes that will contain a ball, and b_1,\\ b_2,\\ ...,\\ b_M are the integers written on these boxes, in any order.\n",
"\n",
" \n",
"Constraints:\n",
"-----Sample Input-----\n",
"3\n",
"1 0 0\n",
"\n",
"1 <= arr.length <= 10^5\n",
"arr.length is even.\n",
"1 <= arr[i] <= 10^5\n",
"class Solution:\n",
" def minSetSize(self, arr: List[int]) -> int:\n",
" \n",
"-----Sample Output-----\n",
"1\n",
"1\n",
"\n",
"Use Call-Based format\n",
"Consider putting a ball only in the box with 1 written on it.\n",
" - There are three boxes with multiples of 1 written on them: the boxes with 1, 2, and 3. The total number of balls contained in these boxes is 1.\n",
" - There is only one box with a multiple of 2 written on it: the box with 2. The total number of balls contained in these boxes is 0.\n",
" - There is only one box with a multiple of 3 written on it: the box with 3. The total number of balls contained in these boxes is 0.\n",
"Thus, the condition is satisfied, so this set of choices is good.\n",
"\n",
"\n",
"Use Standard Input format\n",
"\n",
"ANSWER:\n",
"\n",
"==========\n",
"answer :\n",
"\n",
"class Solution:\n",
"\tdef minSetSize(self, arr: List[int]) -> int:\n",
"\t\t# get length of array \n",
"\t\tlength = len(arr)\n",
"\t\t# build dict to count how many times each int appears\n",
"\t\tcounts = {}\n",
"\t\tfor num in arr:\n",
"\t\t\tif num not in counts:\n",
"\t\t\t\tcounts[num] =1\n",
"\t\t\telse:\n",
"\t\t\t\tcounts[num] += 1\n",
"\t\t\t\t\n",
"\t\t# print(counts)\n",
"\t\t\n",
"\t\t# get values from dict, sort in descending order\n",
"\t\tdescending = sorted(counts.values(), reverse = True)\n",
"\t\t# print(descending)\n",
"\t\t# initialize 2 variables: count and total\n",
"\t\tcount = 0\n",
"\t\ttotal = 0\n",
"\t\t# loop over descending list of counts\n",
"\t\tfor num in descending:\n",
"\t\t\t# add each number to our total\n",
"\t\t\ttotal += num\n",
"\t\t\t# increment count by 1\n",
"\t\t\tcount += 1\n",
"\t\t\t# if our total is half or more, return count\n",
"\t\t\tif total >= length/2:\n",
"\t\t\t\treturn count\n",
"N = int(input())\n",
"A = list(map(int,input().split()))\n",
"B = (N+1)*[0]\n",
"\n",
"for n in range(N,0,-1):\n",
"\tB[n] = (sum(B[n::n])+A[n-1])%2\n",
"\n",
"print(B.count(1))\n",
"for n in range(N+1):\n",
"\tif B[n]:\n",
"\t\tprint(n,end=\" \")\n",
"\n",
"==========\n"
]
@ -161,223 +160,332 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 11,
"id": "1b109cec-c721-410f-b3df-38a50ae8607b",
"metadata": {},
"outputs": [],
"source": [
"prompt = dataset[82239][\"question\"]"
"# prompt = dataset[82239][\"question\"]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "820f8729-a368-4b41-91ff-56601d21aaaf",
"execution_count": 37,
"id": "8fb6f26e-bc50-40e6-b373-3d9e50993b22",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
"Using custom data configuration formatted-d9019fd8ed858445\n",
"Reusing dataset apps (/home/shared/.cache/hf/datasets/apps/formatted-d9019fd8ed858445/0.1.0/5987476458cc986e36654364319c6fe798b880d64a35518cbc00dc04f3c41e4d)\n"
]
},
}
],
"source": [
"train_dataset = load_dataset(\"/home/arto/datasets/datasets/apps/apps.py\", \"formatted\", split=\"train\")"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "0eb58b5c-4a3b-48cb-82e1-e68b792f806e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"QUESTION:\n",
"~~~if:csharp,javascript,cfml,php\n",
"Given a 2D array of size `m * n`. Your task is to find the sum of minimum value in each row.\n",
"~~~\n",
"~~~if:cpp\n",
"Given a 2D vector of size `m * n`. Your task is to find the sum of minimum value in each row.\n",
"~~~\n",
"~~~[\n",
" [1, 2, 3, 4, 5, 6], # smallest number is 1\n",
" [4, 5, 6], # largest number is 1\n",
" [18, 17, 19, 20, 21, 22], # smallest number is 18\n",
"~~~if:python,ruby\n",
"Given a 2D list of size `m * n`. Your task is to find the sum of minimum value in each row.\n",
"~~~\n",
"\n",
"For Example:\n",
"```python\n",
"[\n",
" [1, 2, 3, 4, 5], # minimum value of row is 1\n",
" [5, 6, 7, 8, 9], # minimum value of row is 5\n",
" [20, 21, 34, 56, 100] # minimum value of row is 20\n",
"]\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"~~~\n",
"```\n",
"So, the function should return `26` because sum of minimums is as `1 + 5 + 20 = 26`\n",
"\n",
"~~~if:javascript,php\n",
"Note: You will be always given non-empty array containing Positive values.\n",
"~~~\n",
"~~~if:python\n",
"Note: You will be always given non-empty list containing Positive values.\n",
"~~~\n",
"~~~if:cpp\n",
"Note: You will be always given non-empty vector containing Positive values.\n",
"~~~\n",
"~~~if:c#\n",
"Note: You will be always given non-empty vector containing Positive values.\n",
"~~~\n",
"~~~if:cfml\n",
"Note: You will be always given non-empty array containing Positive values.\n",
"~~~\n",
"\n",
"ENJOY CODING :)\n",
"def sum_of_minimums(numbers):\n",
"\t\n",
"\n",
"Use Call-Based format\n",
"\n",
"ANSWER:\n",
"\n"
]
}
],
"source": [
"print(train_dataset[82239][\"question\"])"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "4decd052-f96b-4469-abfa-25beb9ab805c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id :\n",
"\n",
"78360\n",
"==========\n",
"question :\n",
"\n",
"\n",
"QUESTION:\n",
"# Personalized greeting\n",
"\n",
"Create a function that gives a personalized greeting. This function takes two parameters: `name` and `owner`.\n",
"\n",
"Use conditionals to return the proper message:\n",
"\n",
"case | return\n",
"--- | ---\n",
"name equals owner | 'Hello boss'\n",
"otherwise | 'Hello guest'\n",
"def greet(name, owner):\n",
"\t\n",
"\n",
"Use Call-Based format\n",
"\n",
"ANSWER:\n",
"\n",
"==========\n",
"answer :\n",
"\n",
"def greet(name, owner):\n",
"\tgreet = 'guest'\n",
"\tif name == owner: \n",
"\t\tgreet = 'boss'\n",
"\treturn 'Hello ' + greet\n",
"\n",
"==========\n"
]
}
],
"source": [
"import random\n",
"\n",
"id = random.randint(0, len(train_dataset)-1)\n",
"sample = train_dataset[id]\n",
"\n",
"for k, v in sample.items():\n",
" print(k, \":\\n\")\n",
" print(v)\n",
" print(\"=\"*10)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "d3be9853-0ce7-40a5-976e-6ff082a7ed2f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"115212"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(dataset)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "eea91645-fbbc-4495-a829-357cd2833a6b",
"metadata": {},
"outputs": [],
"source": [
"id = 115212-10"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "2b72f491-8a44-4206-ba06-bc1bc6ba011c",
"metadata": {},
"outputs": [],
"source": [
"prompt = dataset[id][\"question\"]"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "54768ef7-3752-45da-b5c8-721893c9b254",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"QUESTION:\n",
"Given an array of integers, return indices of the two numbers such that they add up to a specific target.\n",
"\n",
"You may assume that each input would have exactly one solution, and you may not use the same element twice.\n",
"\n",
"Example:\n",
"\n",
"\n",
"Given nums = [2, 7, 11, 15], target = 9,\n",
"\n",
"Because nums[0] + nums[1] = 2 + 7 = 9,\n",
"return [0, 1].\n",
"class Solution:\n",
" def twoSum(self, nums: List[int], target: int) -> List[int]:\n",
" \n",
"\n",
"Use Call-Based format\n",
"\n",
"ANSWER:\n",
"\n"
]
}
],
"source": [
"print(prompt)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "820f8729-a368-4b41-91ff-56601d21aaaf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"class Solution:\n",
"\tdef twoSum(self, nums, target):\n",
"\t \"\"\"\n",
"\t :type nums: List[int]\n",
"\t :type target: int\n",
"\t :rtype: List[int]\n",
"\t \"\"\"\n",
"\t a = {}\n",
"\t b = {}\n",
"\t for i in range(1, target+1):\n",
"\t\t a[i] += nums[i]\n",
"\t \n",
"\t a.clear()\n",
"\t for i in range(1, target+1):\n",
"\t\t a.get(i, 0)\n",
"\t return sorted(a)\n",
"\t\t\t\n",
"\t\t\n",
"\tdef helper(n):\n",
"\t s = 1\n",
"\t q ='sum'\n",
"\t while n!= s:\n",
"\t\t n += 1\n",
"\t\t q +='sum'\n",
"\t\t s += q\n",
"\t\t n += 1\n",
"\t return q\n"
]
}
],
"source": [
"input_ids = tokenizer(prompt, return_tensors=\"pt\").input_ids\n",
"start = len(input_ids[0])\n",
"output = model.generate(\n",
" input_ids,\n",
" max_length=1000,\n",
" max_length=start+400,\n",
" do_sample=True,\n",
" top_p=0.95,\n",
" pad_token_id=tokenizer.pad_token_id,\n",
" early_stopping=True,\n",
" temperature=1.,\n",
" no_repeat_ngram_size=None,\n",
" repetition_penalty=None,\n",
" num_return_sequences=None,\n",
")\n",
"\n",
"print(tokenizer.decode(output[0][start:]))"
"print(tokenizer.decode(output[0][start:]).strip())"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "e8d5ad26-1cd6-49bb-8f4c-550091bfcc9d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"import numpy as np\n",
"\n",
"H, N = list(map(int, input().split()))\n",
"\n",
"ab = []\n",
"for i in range(N):\n",
"\ta, b = list(map(int, input().split()))\n",
"\tab.append([a, b])\n",
"\n",
"ab = np.array(ab)\n",
"a_list = ab[:, 0]\n",
"b_list = ab[:, 1]\n",
"max_a = ab.max()\n",
"\n",
"inf = float('inf')\n",
"dp = np.array([inf for _ in range(H + max_a)])\n",
"dp[0] = 0\n",
"\n",
"for i in range(1, len(dp)):\n",
"\tdp[i] = np.amin(dp[np.maximum(i - a_list, 0)] + b_list)\n",
"\n",
"print((int(min(dp[H:]))))\n",
"\n"
]
}
],
"source": [
"print(dataset[id][\"answer\"])"
]
},
{

File diff suppressed because it is too large