Merge branch 'evaluation' of https://github.com/ncoop57/gpt-code-clippy into evaluation

adding codebleu evaluation
This commit is contained in:
Mrinal18 2021-07-15 08:58:28 +00:00
commit 39ded45239
24 changed files with 2284 additions and 122 deletions

BIN
.DS_Store vendored Normal file

Binary file not shown.

9
.gitmodules vendored

@ -1,3 +1,12 @@
[submodule "dependency_repos/github-downloader"]
path = dependency_repos/github-downloader
url = https://github.com/EleutherAI/github-downloader
[submodule "dependency_repos/apps"]
path = dependency_repos/apps
url = https://github.com/hendrycks/apps.git
[submodule "dependency_repos/human-eval"]
path = dependency_repos/human-eval
url = https://github.com/openai/human-eval
[submodule "dependency_repos/CodeXGLUE"]
path = dependency_repos/CodeXGLUE
url = https://github.com/microsoft/CodeXGLUE

@ -0,0 +1 @@
Subproject commit 3e7bfe6dc4a88534c7803ce1bd8d1733c1d16888

1
dependency_repos/apps Submodule

@ -0,0 +1 @@
Subproject commit f834ca7d7405935376aabb5830edd0c42635824e

@ -0,0 +1 @@
Subproject commit 463c980b59e818ace59f6f9803cd92c749ceae61

BIN
metrics/.DS_Store vendored Normal file

Binary file not shown.


@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Took the following from CodeXGlue Repository - https://github.com/microsoft/CodeXGLUE/blob/main/Code-Code/code-to-code-trans/evaluator/CodeBLEU/bleu.py
# The following code is taken from CodeXGlue Repository - https://github.com/microsoft/CodeXGLUE/blob/main/Code-Code/code-to-code-trans/evaluator/CodeBLEU/bleu.py
"""Python implementation of BLEU and smooth-BLEU.
@ -28,98 +28,106 @@ import math
def _get_ngrams(segment, max_order):
"""Extracts all n-grams upto a given maximum order from an input segment.
"""Extracts all n-grams upto a given maximum order from an input segment.
Args:
segment: text segment from which n-grams will be extracted.
max_order: maximum length in tokens of the n-grams returned by this
methods.
Args:
segment: text segment from which n-grams will be extracted.
max_order: maximum length in tokens of the n-grams returned by this
methods.
Returns:
The Counter containing all n-grams upto max_order in segment
with a count of how many times each n-gram occurred.
"""
ngram_counts = collections.Counter()
for order in range(1, max_order + 1):
for i in range(0, len(segment) - order + 1):
ngram = tuple(segment[i:i+order])
ngram_counts[ngram] += 1
return ngram_counts
Returns:
The Counter containing all n-grams upto max_order in segment
with a count of how many times each n-gram occurred.
"""
ngram_counts = collections.Counter()
for order in range(1, max_order + 1):
for i in range(0, len(segment) - order + 1):
ngram = tuple(segment[i : i + order])
ngram_counts[ngram] += 1
return ngram_counts
def compute_bleu(reference_corpus, translation_corpus, max_order=4,
smooth=True):
"""Computes BLEU score of translated segments against one or more references.
def compute_bleu(reference_corpus, translation_corpus, max_order=4, smooth=True):
"""Computes BLEU score of translated segments against one or more references.
Args:
reference_corpus: list of lists of references for each translation. Each
reference should be tokenized into a list of tokens.
translation_corpus: list of translations to score. Each translation
should be tokenized into a list of tokens.
max_order: Maximum n-gram order to use when computing BLEU score.
smooth: Whether or not to apply Lin et al. 2004 smoothing.
Args:
reference_corpus: list of lists of references for each translation. Each
reference should be tokenized into a list of tokens.
translation_corpus: list of translations to score. Each translation
should be tokenized into a list of tokens.
max_order: Maximum n-gram order to use when computing BLEU score.
smooth: Whether or not to apply Lin et al. 2004 smoothing.
Returns:
3-Tuple with the BLEU score, n-gram precisions, geometric mean of n-gram
precisions and brevity penalty.
"""
matches_by_order = [0] * max_order
possible_matches_by_order = [0] * max_order
reference_length = 0
translation_length = 0
for (references, translation) in zip(reference_corpus,
translation_corpus):
reference_length += min(len(r) for r in references)
translation_length += len(translation)
Returns:
3-Tuple with the BLEU score, n-gram precisions, geometric mean of n-gram
precisions and brevity penalty.
"""
matches_by_order = [0] * max_order
possible_matches_by_order = [0] * max_order
reference_length = 0
translation_length = 0
for (references, translation) in zip(reference_corpus, translation_corpus):
reference_length += min(len(r) for r in references)
translation_length += len(translation)
merged_ref_ngram_counts = collections.Counter()
for reference in references:
merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
translation_ngram_counts = _get_ngrams(translation, max_order)
overlap = translation_ngram_counts & merged_ref_ngram_counts
for ngram in overlap:
matches_by_order[len(ngram)-1] += overlap[ngram]
for order in range(1, max_order+1):
possible_matches = len(translation) - order + 1
if possible_matches > 0:
possible_matches_by_order[order-1] += possible_matches
merged_ref_ngram_counts = collections.Counter()
for reference in references:
merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
translation_ngram_counts = _get_ngrams(translation, max_order)
overlap = translation_ngram_counts & merged_ref_ngram_counts
for ngram in overlap:
matches_by_order[len(ngram) - 1] += overlap[ngram]
for order in range(1, max_order + 1):
possible_matches = len(translation) - order + 1
if possible_matches > 0:
possible_matches_by_order[order - 1] += possible_matches
precisions = [0] * max_order
for i in range(0, max_order):
if smooth:
precisions[i] = ((matches_by_order[i] + 1.) /
(possible_matches_by_order[i] + 1.))
precisions = [0] * max_order
for i in range(0, max_order):
if smooth:
precisions[i] = (matches_by_order[i] + 1.0) / (
possible_matches_by_order[i] + 1.0
)
else:
if possible_matches_by_order[i] > 0:
precisions[i] = (
float(matches_by_order[i]) / possible_matches_by_order[i]
)
else:
precisions[i] = 0.0
if min(precisions) > 0:
p_log_sum = sum((1.0 / max_order) * math.log(p) for p in precisions)
geo_mean = math.exp(p_log_sum)
else:
if possible_matches_by_order[i] > 0:
precisions[i] = (float(matches_by_order[i]) /
possible_matches_by_order[i])
else:
precisions[i] = 0.0
geo_mean = 0
if min(precisions) > 0:
p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
geo_mean = math.exp(p_log_sum)
else:
geo_mean = 0
ratio = float(translation_length) / reference_length
ratio = float(translation_length) / reference_length
if ratio > 1.0:
bp = 1.0
else:
bp = math.exp(1 - 1.0 / ratio)
bleu = geo_mean * bp
bleu_score_dict = {
"bleu": bleu,
"precision": precisions,
"bp": bp,
"ratio": ratio,
"trans_len": translation_length,
"ref_len": reference_length,
}
return bleu_score_dict # (bleu, precisions, bp, ratio, translation_length, reference_length)
if ratio > 1.0:
bp = 1.
else:
bp = math.exp(1 - 1. / ratio)
bleu = geo_mean * bp
print(geo_mean)
bleu_score_dict = {"bleu":bleu,"precision":precisions,"bp":bp,"ratio":ratio,"trans_len":translation_length,"ref_len":reference_length}
return bleu_score_dict#(bleu, precisions, bp, ratio, translation_length, reference_length)
def bleu_test_case():
"""A simple functionality test case to evaluate BLEU"""
generated = [[["a","=","b","\n","y","=","a","+","1"]]]
reference = [["a","=","b","\n","print","a"]]
score_dict = compute_bleu(generated,reference,smooth=False)
generated = [[["a", "=", "b", "\n", "y", "=", "a", "+", "1"]]]
reference = [["a", "=", "b", "\n", "print", "a"]]
score_dict = compute_bleu(generated, reference, smooth=False)
return score_dict
if __name__ == "__main__":
score_dict = bleu_test_case()
print(score_dict)
print(score_dict)
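
For orientation, here is a minimal usage sketch of the compute_bleu API above. It is illustrative only (not part of this diff) and the token lists are made up; note that reference_corpus is a list of reference lists, with one list of references per candidate.

from metrics.bleu import compute_bleu

# Two hypothetical candidates, each paired with a single pre-tokenized reference.
references = [
    [["def", "add", "(", "a", ",", "b", ")", ":", "return", "a", "+", "b"]],
    [["x", "=", "1"]],
]
candidates = [
    ["def", "add", "(", "a", ",", "b", ")", ":", "return", "a", "+", "b"],
    ["x", "=", "2"],
]

scores = compute_bleu(references, candidates, max_order=4, smooth=True)
print(scores["bleu"], scores["precision"], scores["bp"])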


@ -1,6 +1,10 @@
from metrics.bleu import compute_bleu
from metrics.parse_check import check_parse
def compute_metrics(references,generated) -> dict:
Parser = check_parse() # Initializing parser
def compute_metrics(references, generated, lang) -> dict:
"""
Calculates various metrics and returns a dict of the computed metrics.
args:
@ -8,11 +12,12 @@ def compute_metrics(references,generated) -> dict:
reference should be tokenized into a list of tokens.
translation: list of translations to score. Each translation
should be tokenized into a list of tokens.
lang (str): The language the generated code belongs to
returns:
A dictionary containing the computed metrics.
"""
metrics_dict = {} #Update as in new metrics are added over here.
metrics_dict["smoothed_bleu_4"] = compute_bleu(references,generated,smooth=True)
metrics_dict["bleu_4"] = compute_bleu(references,generated,smooth=False)
return metrics_dict
metrics_dict = {}  # Update this as new metrics are added here.
metrics_dict["smoothed_bleu_4"] = compute_bleu(references, generated, smooth=True)
metrics_dict["bleu_4"] = compute_bleu(references, generated, smooth=False)
metrics_dict["parse_score"] = Parser(generated, lang)["parse_score"]
return metrics_dict

53
metrics/parse_check.py Normal file

@ -0,0 +1,53 @@
from tree_sitter import Language, Parser
def load_tree_sitter_languages():
"""Loads language Grammars to evaluate"""
py_parser = Parser()
py_parser.set_language(Language('./tree_sitter_utils/build/my-languages.so', 'python'))
js_parser = Parser()
js_parser.set_language(Language('./tree_sitter_utils/build/my-languages.so', 'javascript'))
cpp_parser = Parser()
cpp_parser.set_language(Language('./tree_sitter_utils/build/my-languages.so', 'cpp'))
go_parser = Parser()
go_parser.set_language(Language('./tree_sitter_utils/build/my-languages.so', 'go'))
java_parser = Parser()
java_parser.set_language(Language('./tree_sitter_utils/build/my-languages.so', 'java'))
return {
"py" : py_parser,
"js" : js_parser,
"cpp" : cpp_parser,
"go" : go_parser,
"java": java_parser
}
class check_parse:
def __init__(self):
self.language_dict = load_tree_sitter_languages()
def __call__(self,batch,lang):
"""
args:
batch : list[str] of code strings generated by the model
lang : one of the language_dict keys above ("py", "js", "cpp", "go", "java")
returns:
dict(
parse_score = number of inputs in the batch that parse without errors (a count, not an average)
index_parse = per-input flag (1/0) indicating whether each input parsed
)
"""
cumulative_parse_score = 0
index_parse_list = []
parser = self.language_dict[lang]
for inp in batch:
parsed = parser.parse(bytes(inp,"utf-8"))
inp_ind_score = int("ERROR" not in parsed.root_node.sexp())
cumulative_parse_score+=inp_ind_score
index_parse_list.append(inp_ind_score)
return {"parse_score":cumulative_parse_score,"index_parse":index_parse_list}
if __name__ == "__main__":
Parse = check_parse()
score = Parse(["""
def a():
if bar:
baz()"""],"py")
print(score)
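
As an illustration only (not part of this commit), check_parse can be exercised for a second language in the same way; this sketch assumes the tree_sitter_utils/build/my-languages.so bundle has been built with the JavaScript grammar listed above.

from metrics.parse_check import check_parse

parse = check_parse()  # loads every grammar from the shared library
result = parse(["function f(x) { return x + 1; }"], "js")
print(result["parse_score"], result["index_parse"])  # expected: 1 [1] for a snippet that parses cleanly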

BIN
metrics/tree_sitter_utils/.DS_Store vendored Normal file

Binary file not shown.

File diff suppressed because one or more lines are too long


@ -0,0 +1,102 @@
# MIT License
# Copyright (c) 2021 Dan Hendrycks and contributors.
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
"""
Run a trained model to generate Python code.
"""
import io
import json
import logging
import math
import random
import numpy as np
import os
import pprint
import sys
import time
import transformers
import torch
from apps_utils.reindent import run as run_reindent
# for timing and debugging
from datetime import datetime, date
from tqdm import tqdm
def reindent_code(codestr):
"""
Given a code string, reindent it in the same way that the
GitHub dataset was indented.
"""
codestr = io.StringIO(codestr)
ret = io.StringIO()
run_reindent(
codestr,
ret,
config={
"dry-run": False,
"help": False,
"to": 10,
"from": -1,
"tabs": True,
"encoding": "utf-8",
"is-tabs": False,
"tabsize": 10,
"all-tabs": False,
},
)
return ret.getvalue()
def generate_prompt(
test_case_path, prompt_path, solutions_path, tokenizer, starter_path=None
):
_input = "\nQUESTION:\n"
with open(prompt_path, "r") as f:
data = f.readlines()
data = "".join(data)
_input += data
if starter_path != None:
with open(starter_path, "r") as f:
data = f.readlines()
data = "".join(data)
data = "\n" + data # + "\n"
_input += data
else:
# _input += "\n\n"
pass
with open(test_case_path, "r") as f:
data = json.load(f)
if not data.get("fn_name"):
_input += "\nUse Standard Input format" # \n"
else:
_input += "\nUse Call-Based format" # \n"
_input += "\nANSWER:\n"
return _input
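
A hedged sketch of driving generate_prompt above for a single APPS problem directory; the directory path below is a placeholder and not part of this commit.

from transformers import AutoTokenizer
from apps_utils.generate_gpt_codes import generate_prompt

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125M")
problem_dir = "dependency_repos/apps/APPS/test/0000"  # hypothetical problem folder
prompt = generate_prompt(
    f"{problem_dir}/input_output.json",  # test_case_path
    f"{problem_dir}/question.txt",       # prompt_path
    f"{problem_dir}/solutions.json",     # solutions_path (not read by the function body above)
    tokenizer,                           # accepted but unused by the body above
    starter_path=None,
)
print(prompt[:200])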


@ -0,0 +1,227 @@
# MIT License
# Copyright (c) 2021 Dan Hendrycks and contributors.
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
"""
Reindent files.
"""
from __future__ import print_function
import sys
import getopt
import codecs
import tempfile
import shutil
import os
def _find_indentation(line, config):
if len(line) and line[0] in (" ", "\t") and not line.isspace():
if line[0] == "\t":
config['is-tabs'] = True
# Find indentation
i = 0
for char in list(line):
if char not in (" ", "\t"):
break
i += 1
config["from"] = i
def find_indentation(line, config):
# Find indentation level used in file
if config['from'] < 0:
_find_indentation(line, config)
if config['from'] >= 0:
# Set old indent
indent = " " if not config['is-tabs'] else "\t"
indent = indent * config['from']
# Set new indent
newindent = " " if not config['tabs'] else "\t"
if not config['tabs']:
newindent = newindent * config['to']
return indent, newindent
# Continue to the next line, indentation not found
return False
def replace_inline_tabs(content, config):
newcontent = ""
imagined_i = 0
for i in range(0, len(content)):
char = content[i]
if char == '\t':
spaces = config['tabsize']-(imagined_i % config['tabsize'])
newcontent += " " * spaces
imagined_i += spaces
else:
newcontent += char
imagined_i += 1
return newcontent
def run(fd_in, fd_out, config):
from reindent_4_spaces import Reindenter
import io
inter = io.StringIO()
ri = Reindenter(fd_in)
ri.run()
ri.write(inter)
fd_in = inter
fd_in.seek(0)
while True:
line = fd_in.readline()
if not line:
break
line = line.rstrip('\r\n')
# Find indentation style used in file if not set
if config['from'] < 0:
indent = find_indentation(line, config)
if not indent:
print(line, file=fd_out)
continue
indent, newindent = indent
# Find current indentation level
level = 0
while True:
whitespace = line[:len(indent) * (level + 1)]
if whitespace == indent * (level + 1):
level += 1
else:
break
content = line[len(indent) * level:]
if config['all-tabs']:
content = replace_inline_tabs(content, config)
line = (newindent * level) + content
print(line, file=fd_out)
# print(config)
def run_files(filenames, config):
for filename in filenames:
with codecs.open(filename, encoding=config['encoding']) as fd_in:
if config['dry-run']:
print("Filename: %s" % filename)
fd_out = sys.stdout
else:
fd_out = tempfile.NamedTemporaryFile(mode='wb', delete=False)
fd_out.close()
fd_out = codecs.open(fd_out.name, "wb", encoding=config['encoding'])
run(fd_in, fd_out, config)
if not config["dry-run"]:
fd_out.close()
shutil.copy(fd_out.name, filename)
os.remove(fd_out.name)
def main(args):
config = {
"dry-run": False,
"help": False,
"to": 4,
"from": -1,
"tabs": False,
"encoding": "utf-8",
"is-tabs": False,
"tabsize": 4,
"all-tabs": False
}
possible_args = {
"d": "dry-run",
"h": "help",
"t:": "to=",
"f:": "from=",
"n": "tabs",
"e:": "encoding=",
"s:": "tabsize=",
"a": "all-tabs",
}
optlist, filenames = getopt.getopt(
args[1:],
"".join(possible_args.keys()),
possible_args.values()
)
shortargs, longargs = [], []
for shortarg in possible_args:
shortargs.append(shortarg.rstrip(":"))
longargs.append(possible_args[shortarg].rstrip("="))
for opt, val in optlist:
opt = opt.lstrip("-")
if opt in shortargs:
opt = longargs[shortargs.index(opt)]
if isinstance(config[opt], bool):
config[opt] = True
elif isinstance(config[opt], int):
config[opt] = int(val)
else:
config[opt] = val
if config['help']:
help = """
Usage: %s [options] filename(s)
Options:
-h, --help Show this message
-d, --dry-run Don't save anything, just print
the result
-t <n>, --to <n> Convert to this number of spaces
(default: 4)
-f <n>, --from <n> Convert from this number of spaces
(default: auto-detect, will also
detect tabs)
-n, --tabs Don't convert indentation to spaces,
convert to tabs instead. -t and
--to will have no effect.
-a, --all-tabs Also convert tabs used for alignment
in the code (Warning: will replace
all tabs in the file, even if inside
a string)
-s <n>, --tabsize <n> Set how many spaces one tab is
(only has an effect on -a, default: 4)
-e <s>, --encoding <s> Open files with specified encoding
(default: utf-8)
""" % args[0]
# Also removes 8 leading spaces to remove our indentation
print("\n".join([x[8:] for x in help[1:].split("\n")]))
sys.exit(0)
if filenames:
run_files(filenames, config)
else:
run(sys.stdin, sys.stdout, config)
if __name__ == "__main__":
main(sys.argv)
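
A minimal sketch of calling run() above on an in-memory string (illustrative only; it assumes the reindent_4_spaces helper that run() imports is available on the path, as the module itself expects).

import io
from apps_utils.reindent import run

src = io.StringIO("def f():\n  return 1\n")
dst = io.StringIO()
run(src, dst, config={
    "dry-run": False, "help": False,
    "to": 4, "from": -1, "tabs": False,
    "encoding": "utf-8", "is-tabs": False,
    "tabsize": 4, "all-tabs": False,
})
print(dst.getvalue())  # the snippet re-indented to 4 spaces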


@ -0,0 +1,158 @@
# MIT License
# Copyright (c) 2021 Dan Hendrycks and contributors.
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
"""
Run solutions from one problem.
"""
import io
import json
import logging
import math
import numpy as np
import os
import pprint
import sys
import apps_utils.testing_util as test_util
import time
# for timing debugging
from datetime import datetime, date
from pathlib import Path
from tqdm import tqdm
from typing import List
def print_results(results, args):
res = []
per_prob_res = []
all_correct = []
for index in results:
res.extend(results[index])
per_prob_res.append(np.mean(results[index]))
all_correct.append(np.all(results[index]))
tmp_results = res  # NOTE: res is a plain Python list here, so the mask-style indexing below does not filter the way a NumPy array would
compile_errors = len(tmp_results[tmp_results == -2])
runtime_errors = len(tmp_results[tmp_results == -1])
failures = len(tmp_results[tmp_results == False])
successes = len(tmp_results[tmp_results == True])
total_testcases = len(res)
if args.debug:
print(
f"number of compile errors = {compile_errors} avg = {compile_errors / total_testcases }"
)
print(
f"number of runtime errors = {runtime_errors} avg = {runtime_errors / total_testcases}"
)
print(f"number of test cases run = {total_testcases}")
print(
f"Test Case Average (average accuracy over problems) = {np.mean(per_prob_res)}"
)
print(
f"Strict Accuracy (all test cases passed / total problems) = {np.mean(all_correct)}"
)
def eval_and_save_problems(test_loc, save):
test_path = Path(test_loc)
problems = list(test_path.glob("*/"))
print(len(problems))
gpt_codes = {}
gpt_bleu = {}
gpt_codebleu = {}
results = {}
codes_loc = os.path.join(save, f"all_codes.json")
# if not os.path.exists(codes_loc):
# codes_loc = os.path.join(args.save, f"{args.start}-{args.end}_codes.json")
if os.path.exists(codes_loc):
results_loc = os.path.join(save, f"all_results.json")
print(codes_loc, results_loc)
with open(codes_loc, "r") as f:
gpt_codes = json.load(f)
# main eval loop
for index, problem in enumerate(tqdm(problems[:2])):
try:
# if args.debug:
# print(f"\n\nproblem path = {problem}")
output_str = gpt_codes[str(index)]
except:
print("CANNOT FIND OUTPUT_STR FOR", problem)
continue
prob_path = problem # os.path.join(args.root, problem)
# with open(os.path.join(prob_path, "solutions.json"), "r") as f:
# sols = json.load(f)
if not os.path.exists(save):
os.makedirs(save)
res = []
# for o_idx, o in enumerate(output_str):
# print(o)
# if args.debug:
# print(f"\nTesting solution {o_idx}")
curr_res = [-2]
try:
curr_res = test_util.run_test(
prob_path=prob_path, test=output_str, debug=False # args.debug
)
fixed = []
for e in curr_res:
if isinstance(e, np.ndarray):
e = e.item(0)
if isinstance(e, np.bool_):
e = bool(e)
fixed.append(e)
curr_res = fixed
if not np.all(curr_res):
print(f"Results were not all True: {curr_res}")
except Exception as e:
print(f"test framework exception = {repr(e)}{e}\n")
break
finally:
assert isinstance(curr_res, list)
res.append(curr_res)
# if args.debug:
# print(
# f"\nHow to read results [-2] = compile error, [-1] = runtime error [False] = failed test case [True] = passed test case"
# )
# print(f"results = {res}")
results[index] = res
with open(results_loc, "w") as f:
try:
f.write(json.dumps(results))
except Exception as e:
import pdb
pdb.set_trace()
print("didn't save problem due to {e}")
return results
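
The per-test codes produced above follow the APPS convention: -2 = compile error, -1 = runtime error, False/True = failed/passed test case. An illustrative helper (not part of this commit) for tallying one problem's results:

def summarize_problem(curr_res):
    """Tally run_test result codes for a single problem."""
    return {
        "compile_errors": sum(1 for r in curr_res if r == -2),
        "runtime_errors": sum(1 for r in curr_res if r == -1),
        "failed": sum(1 for r in curr_res if r is False),
        "passed": sum(1 for r in curr_res if r is True),
    }

print(summarize_problem([True, False, -1, True]))
# {'compile_errors': 0, 'runtime_errors': 1, 'failed': 1, 'passed': 2}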


@ -0,0 +1,544 @@
import argparse
import json
import os
import sys
import io
import faulthandler
# used for debugging to time steps
from datetime import datetime
# to run the solution files we're using a timing based approach
import signal
import numpy as np
# for capturing the stdout
from io import StringIO
from typing import get_type_hints
from typing import List, Tuple
# used for testing the code that reads from input
from unittest.mock import patch, mock_open
from pyext import RuntimeModule
from enum import Enum
class CODE_TYPE(Enum):
call_based = 0
standard_input = 1
# stuff for setting up signal timer
class TimeoutException(Exception):
pass
def timeout_handler(signum, frame):
print("alarm went off")
#return
raise TimeoutException
signal.signal(signal.SIGALRM, timeout_handler)
timeout = 4 # seconds
# used to capture stdout as a list
# from https://stackoverflow.com/a/16571630/6416660
# alternative use redirect_stdout() from contextlib
class Capturing(list):
def __enter__(self):
self._stdout = sys.stdout
sys.stdout = self._stringio = StringIO()
# Make closing the StringIO a no-op
self._stringio.close = lambda x: 1
return self
def __exit__(self, *args):
self.extend(self._stringio.getvalue().splitlines())
del self._stringio # free up some memory
sys.stdout = self._stdout
def parse_args():
parser = argparse.ArgumentParser(description="Utility for testing code generation.")
parser.add_argument("-v", "--verbosity-level", action="store", type=int,
help="")
parser.add_argument("-s", "--source", type=str, default="leetcode",
choices=["leetcode", "atcoder", "codewars",],
help="which data source to gather from.")
parser.add_argument("-d", "--data", type=str, default="question",
choices=["question", "q", "solutions", "sol", "s", "starter", "tests", "t"],
help="which type of data to receive.")
parser.add_argument("-n", "--number", type=int, default=0,
help="which problem to query.")
args = parser.parse_args()
return args
def get_valid_problems(data_dir="leetcode"):
# these are unnecessary atm
if data_dir == "leetcode":
root = os.path.join(args.source, "data")
elif data_dir == "atcoder":
pass
root = os.path.join(data_dir, "data")
if os.path.exists(os.path.join(data_dir, "valid_problems.json")):
with open(os.path.join(data_dir, "valid_problems.json"), "r") as f:
return json.load(f)
# after we compute it once let's save it and load that instead
# TODO determine if might be better to reload each time
tmp = os.listdir(root)
valid_probs = []
for folder in tmp:
prob_path = os.path.join(root, folder)
files = os.listdir(prob_path)
#TODO add more validity checks
if "input_output.json" in files or "sols.json" in files:
valid_probs.append(prob_path)
valid_probs = sorted(valid_probs)
#with open(os.path.join(args.source,"valid_problems.json"), "w") as f:
# json.dump(valid_probs, f)
return valid_probs
def get_question(problem_list, prob_index):
root = problem_list[prob_index]
#print("get q", root)
if os.path.exists(os.path.join(root, "question.txt")):
with open(os.path.join(root, "question.txt")) as f:
question = f.readlines()
else:
print("question prompt not found")
question = ""
question = "".join(question)
return question
def get_solutions(problem_list, prob_index):
root = problem_list[prob_index]
if os.path.exists(os.path.join(root, "solutions.json")):
with open(os.path.join(root, "solutions.json")) as f:
sols = json.load(f)
return sols
def run_test(prob_path:str=None, problem_list:List[str]=None, prob_index:int=None,
test:str=None, debug:bool=False):
"""
If test is not None, this will try to run the code;
otherwise it just returns the problem's input/output test cases.
"""
if prob_path is None and problem_list is None:
print("please provide either prob_path or problem_list")
exit()
if debug:
print(f"start = {datetime.now().time()}")
if prob_path is not None:
root = prob_path
elif problem_list is not None:
root = problem_list[prob_index]
if os.path.exists(os.path.join(root, "input_output.json")):
with open(os.path.join(root, "input_output.json")) as f:
in_outs = json.load(f)
if debug:
print(f"test cases json = {in_outs['inputs']} {in_outs['outputs']}")
if in_outs.get("fn_name") is None:
which_type = CODE_TYPE.standard_input # Standard input
method_name = None
else:
which_type = CODE_TYPE.call_based # Call-based
method_name = in_outs["fn_name"]
if debug:
print(f"loaded json = {datetime.now().time()}")
#else:
# continue
if test is None:
return in_outs
elif test is not None:
results = []
sol = "import sys\nimport time\nimport itertools\nfrom itertools import accumulate, product, permutations, combinations\nimport collections\nfrom collections import Counter, OrderedDict, deque, defaultdict, ChainMap\nfrom functools import lru_cache\nimport math\nfrom math import sqrt, sin, cos, tan, ceil, fabs, floor, gcd, exp, log, log2\nimport fractions\nfrom typing import List, Tuple\nimport numpy as np\nimport random\nimport heapq\nfrom heapq import *\n"
if debug:
print(f"loading test code = {datetime.now().time()}")
if which_type == CODE_TYPE.call_based:
sol += test
if debug: # or True:
print(f"sol = {sol}")
signal.alarm(timeout)
try:
tmp_sol = RuntimeModule.from_string("tmp_sol", "", sol)
if "class Solution" not in test:
tmp = tmp_sol
else:
tmp = tmp_sol.Solution()
signal.alarm(0)
except Exception as e:
signal.alarm(0)
print(f"type 0 compilation error = {e}")
results.append(-2)
return results
signal.alarm(0)
elif which_type == CODE_TYPE.standard_input:
# sol
tmp_test = test.split("\n")
new_test = []
for x in tmp_test:
if (not x.startswith("from ")) and (not x.startswith("import ")):
new_test.append("\t" + x + "\n")
else:
new_test.append(x + "\n")
tmp_test = new_test
new_test = ""
started = False
for i in tmp_test:
if i.startswith("\t") and not started:
new_test += "stdin = sys.stdin\nstdout = sys.stdout\n"
new_test += "def code():\n"
new_test += i
started = True
elif started and ((i.startswith("from ")) or (i.startswith("import "))):
new_test += "\t" + i
else:
new_test += i
tmp_test = new_test
sol += tmp_test
if debug:
print(f"sol = {sol}")
# print(f"{o}")
method_name = "code"
signal.alarm(timeout)
try:
tmp_sol = RuntimeModule.from_string("tmp_sol", "", sol)
tmp = tmp_sol
signal.alarm(0)
except Exception as e:
signal.alarm(0)
print(f"type 1 compilation error = {e}")
results.append(-2)
return results
signal.alarm(0)
if debug:
print(f"get method = {datetime.now().time()}")
try:
method = getattr(tmp, method_name) # get_attr second arg must be str
except:
signal.alarm(0)
e = sys.exc_info()
print(f"unable to get function error = {e}")
return results
for index, inputs in enumerate(in_outs["inputs"]):
# JSON forces dictionaries to have string keys; this undoes this (assuming a singleton list)
try:
if isinstance(inputs[0], dict):
inputs = [{int(k): v for k,v in inputs[0].items()}]
except:
True
try:
if isinstance(in_outs["outputs"][index], dict):
in_outs["outputs"][index] = [{int(k): v for k,v in in_outs["outputs"][index].items()}]
except:
True
try:
if isinstance(in_outs["outputs"][index][0], dict):
in_outs["outputs"][index] = [{int(k): v for k,v in in_outs["outputs"][index][0].items()}]
except:
True
if debug:
print(f"time: {datetime.now().time()} testing index = {index} inputs = {inputs}, {type(inputs)}. type = {which_type}")
if which_type == CODE_TYPE.call_based: # Call-based
signal.alarm(timeout)
faulthandler.enable()
try:
# print("------------")
# print(inputs)
output = method(*inputs)
# ground truth sequences are not tuples
if isinstance(output, tuple):
output = list(output)
tmp_result = output == in_outs["outputs"][index]
if isinstance(in_outs["outputs"][index], list) and in_outs["outputs"][index]:
tmp_result = tmp_result or (output == in_outs["outputs"][index][0])
# ground truth sequences are not tuples
try:
if isinstance(output[0], tuple):
tmp_result = tmp_result or ([list(x) for x in output] == in_outs["outputs"][index][0])
except:
True
results.append(tmp_result)
# reset the alarm
signal.alarm(0)
except Exception as e:
signal.alarm(0)
faulthandler.disable()
print(f"Standard input runtime error or time limit exceeded error = {e}")
results.append(-1)
continue
faulthandler.disable()
signal.alarm(0)
if debug:
print(f"outputs = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
elif which_type == CODE_TYPE.standard_input: # Standard input
faulthandler.enable()
signal.alarm(timeout)
passed = False
if isinstance(inputs, list):
inputs = "\n".join(inputs)
if isinstance(in_outs['outputs'][index], list):
in_outs['outputs'][index] = "\n".join(in_outs['outputs'][index])
with Capturing() as output:
try:
call_method(method, inputs)
# reset the alarm
signal.alarm(0)
passed = True
except Exception as e:
# runtime error or took too long
signal.alarm(0)
print(f"Call-based runtime error or time limit exceeded error = {repr(e)}{e}")
results.append(-1)
signal.alarm(0)
if not passed:
if debug:
nl = "\n"
if not isinstance(inputs, list):
print(f"not passed output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl,' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
else:
print(f"not passed output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
continue
if passed and debug:
print(f"==> output = {output}, test outputs = {in_outs['outputs'][index]}")
if custom_compare_(output, in_outs['outputs'][index]):
tmp_result = True
results.append(tmp_result)
continue
# ground truth sequences are expressed as lists not tuples
if isinstance(output, tuple):
output = list(output)
tmp_result = False
try:
tmp_result = (output == [in_outs["outputs"][index]])
if isinstance(in_outs["outputs"][index], list):
tmp_result = tmp_result or (output == in_outs["outputs"][index])
if isinstance(output[0], str):
tmp_result = tmp_result or ([e.strip() for e in output] == in_outs["outputs"][index])
except Exception as e:
print(f"Failed check1 exception = {e}")
pass
if tmp_result == True:
results.append(tmp_result)
continue
# try one more time without \n
if isinstance(in_outs["outputs"][index], list):
for tmp_index, i in enumerate(in_outs["outputs"][index]):
in_outs["outputs"][index][tmp_index] = i.split("\n")
in_outs["outputs"][index][tmp_index] = [x.strip() for x in in_outs["outputs"][index][tmp_index] if x]
else:
in_outs["outputs"][index] = in_outs["outputs"][index].split("\n")
in_outs["outputs"][index] = list(filter(len, in_outs["outputs"][index]))
in_outs["outputs"][index] = list(map(lambda x:x.strip(), in_outs["outputs"][index]))
try:
tmp_result = (output == [in_outs["outputs"][index]])
if isinstance(in_outs["outputs"][index], list):
tmp_result = tmp_result or (output == in_outs["outputs"][index])
except Exception as e:
print(f"Failed check2 exception = {e}")
pass
if tmp_result == True:
results.append(tmp_result)
continue
# try by converting the output into a split up list too
if isinstance(output, list):
output = list(filter(len, output))
if debug:
nl = "\n"
if not isinstance(inputs, list):
print(f"output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl,' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
else:
print(f"output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
if tmp_result == True:
results.append(tmp_result)
continue
try:
tmp_result = (output == [in_outs["outputs"][index]])
if isinstance(in_outs["outputs"][index], list):
tmp_result = tmp_result or (output == in_outs["outputs"][index])
except Exception as e:
print(f"Failed check3 exception = {e}")
pass
try:
output_float = [float(e) for e in output]
gt_float = [float(e) for e in in_outs['outputs'][index]]
tmp_result = tmp_result or ((len(output_float) == len(gt_float)) and np.allclose(output_float, gt_float))
except Exception as e:
pass
try:
if isinstance(output[0], list):
output_float = [float(e) for e in output[0]]
gt_float = [float(e) for e in in_outs['outputs'][index][0]]
tmp_result = tmp_result or ((len(output_float) == len(gt_float)) and np.allclose(output_float, gt_float))
except Exception as e:
pass
if tmp_result == True:
results.append(tmp_result)
continue
# try by converting the stuff into split up list
if isinstance(in_outs["outputs"][index], list):
for tmp_index, i in enumerate(in_outs["outputs"][index]):
in_outs["outputs"][index][tmp_index] = set(i.split())
else:
in_outs["outputs"][index] = set(in_outs["outputs"][index].split())
try:
tmp_result = (output == in_outs["outputs"][index])
except Exception as e:
print(f"Failed check4 exception = {e}")
continue
if tmp_result == True:
results.append(tmp_result)
continue
# try by converting the output into a split up list too
if isinstance(output, list):
for tmp_index, i in enumerate(output):
output[tmp_index] = i.split()
output = list(filter(len, output))
for tmp_index, i in enumerate(output):
output[tmp_index] = set(i)
else:
output = output.split()
output = list(filter(len, output))
output = set(output)
try:
tmp_result = (set(frozenset(s) for s in output) == set(frozenset(s) for s in in_outs["outputs"][index]))
except Exception as e:
print(f"Failed check5 exception = {e}")
# if they are all numbers, round so that similar numbers are treated as identical
try:
tmp_result = tmp_result or (set(frozenset(round(float(t),3) for t in s) for s in output) ==\
set(frozenset(round(float(t),3) for t in s) for s in in_outs["outputs"][index]))
except Exception as e:
print(f"Failed check6 exception = {e}")
if tmp_result == True and debug:
print("PASSED")
results.append(tmp_result)
if debug:
nl = "\n"
if not isinstance(inputs, list):
print(f"output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl,' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
else:
print(f"output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
return results
def custom_compare_(output, ground_truth):
if isinstance(output, list):
output_1 = "\n".join(output)
if stripped_string_compare(output_1, ground_truth):
return True
if isinstance(output, list):
output_2 = [o.lstrip().rstrip() for o in output]
output_2 = "\n".join(output_2)
if stripped_string_compare(output_2, ground_truth):
return True
return False
def stripped_string_compare(s1, s2):
s1 = s1.lstrip().rstrip()
s2 = s2.lstrip().rstrip()
return s1 == s2
def call_method(method, inputs):
if isinstance(inputs, list):
inputs = "\n".join(inputs)
inputs_line_iterator = iter(inputs.split("\n"))
# sys.setrecursionlimit(10000)
# @patch('builtins.input', side_effect=inputs.split("\n"))
@patch('builtins.open', mock_open(read_data=inputs))
@patch('sys.stdin', StringIO(inputs))
@patch('sys.stdin.readline', lambda *args: next(inputs_line_iterator))
@patch('sys.stdin.readlines', lambda *args: inputs.split("\n"))
@patch('sys.stdin.read', lambda *args: inputs)
# @patch('sys.stdout.write', print)
def _inner_call_method(_method):
try:
return _method()
except SystemExit as e:
pass
finally:
pass
return _inner_call_method(method)
def main(args):
print(args)
problem_list = sorted(get_valid_problems(args.source))
print(f"number of problems = {len(problem_list)}")
prob_index = args.number
print(f"problem is {problem_list[prob_index]}")
# This checks it correctly loaded. remove this later
assert prob_index < len(problem_list)
if args.data == "q" or args.data == "question":
tmp = get_question(problem_list, prob_index)
print("q", tmp)
elif args.data in ["solutions", "sol", "s",]:
tmp = get_solutions(problem_list, prob_index)
print("sol", tmp)
elif args.data == "starter":
tmp = get_starter(problem_list, prob_index)
print("starter", tmp)
elif args.data in ["test", "t"]:
# test it with sols
sols = get_solutions(problem_list, prob_index)
tmp = run_test(problem_list, prob_index, test=sols[0])
print("results = ", tmp)
print("-2 = compile error, -1 is runtime error, False failed test, True passed test")
if __name__ == "__main__":
args = parse_args()
main(args)
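
For clarity, an isolated sketch of the Capturing helper defined near the top of this file (illustrative; importing the module assumes its dependencies such as pyext and numpy are installed).

from apps_utils.testing_util import Capturing

with Capturing() as lines:
    print("hello")
    print("world")
print(lines)  # ['hello', 'world']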


@ -0,0 +1,4 @@
from datasets import load_dataset
dataset = load_dataset("code_x_glue_ct_code_to_text", "go")
print(dataset)


@ -0,0 +1,23 @@
import pandas as pd
from datasets import load_dataset, load_metric
from fastcore.script import *
from pathlib import Path
bleu = load_metric("sacrebleu")
predictions = ["hello there kenobi", "foo bar foobar"]
references = [
["hello there general kenobi"],
["foo bar foobar"], # , "hello there !"], # , "foo bar foobar"],
]
@call_parse
def main(concode_path: Param("Path to the concode data in CodeXGLUE", str)):
concode_path = Path(concode_path)
dataset = load_dataset("json", data_files=str(concode_path / "test.json"))
print(dataset)
results = bleu.compute(predictions=predictions, references=references)
print(list(results.keys()))
print(round(results["score"], 1))


@ -0,0 +1,165 @@
import json
import torch
import pandas as pd
# import apps.eval.reident
from apps_utils.generate_gpt_codes import generate_prompt
from apps_utils.test_one_solution import eval_and_save_problems
from datasets import load_dataset, load_metric
from fastcore.script import *
from human_eval.data import write_jsonl, read_problems
from pathlib import Path
from metrics.extrinsic_eval import compute_metrics
from subprocess import check_output
from transformers import AutoTokenizer, AutoModelWithLMHead
bleu = load_metric("sacrebleu")
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125M")
model = AutoModelWithLMHead.from_pretrained(
"/home/nathan/gpt-code-clippy/data/APPS/models/1.5B"
)
def generate_text(prompt):
# print(prompt)
input_ids = torch.LongTensor(tokenizer.encode(prompt, verbose=False)).unsqueeze(
0
) # .cuda()
output_ids = model.generate(
input_ids,
num_beams=2,
early_stopping=True,
max_length=1024 - len(input_ids),
)
output_str = tokenizer.decode(output_ids[0])
return output_str
# # "a", "=", "b", "\n", "y", "=", "a", "+", "1"
# return "a = b \n y = a + 1"
def _eval_concode(path):
# TODO: format input to the model the same way the APPS and OpenAI HumanEval datasets are formatted
data = load_dataset("json", data_files=str(path / "test.json"))["train"]
predictions = [[]]
references = []
for example in data:
output = generate_text(example["nl"])
predictions[0].append(output.split(" "))
references.append(example["code"].split(" "))
results = compute_metrics(predictions, references)
print(f"Bleu score for Concode dataset: {results}")
def _eval_apps(path):
gpt_codes = {}
prob_paths = sorted(path.glob("*/"))
# map prob_paths to strings and save as a json file
str_paths = [str(p) for p in prob_paths]
with open(path / "test.json", "w") as f:
json.dump(str_paths, f)
for index, prob_path in enumerate(prob_paths[:2]):
test_case_path = prob_path / "input_output.json"
prompt_path = prob_path / "question.txt"
starter_path = prob_path / "starter_code.py"
solutions_path = prob_path / "solutions.json"
if not starter_path.exists():
starter_path = None
if not test_case_path.exists() or not prompt_path.exists():
continue
prompt = generate_prompt(
test_case_path,
prompt_path,
solutions_path,
tokenizer,
starter_path=starter_path,
)
output = generate_text(prompt)
print(output)
# print(output)
gpt_codes[index] = output
# print(output)
with open(path.parent / "all_codes.json", "w") as f:
json.dump(gpt_codes, f)
eval_and_save_problems(path, path.parent)
# execute bash command to run eval script
# results = check_output(
# [
# # python3 test_one_solution.py -t /path/to/apps/test --save /path/to/save_dir --print_results
# "python",
# "./apps_utils/test_one_solution.py",
# "-t",
# str(path),
# "--save",
# str(path.parent),
# "--print_results",
# ]
# ).decode("utf-8")
# test_case_path = os.path.join(prob_path, "input_output.json")
# prompt_path = os.path.join(prob_path, "question.txt")
# starter_path = os.path.join(prob_path, "starter_code.py")
# solutions_path = os.path.join(prob_path, "solutions.json")
# generate_prompt(args, test_case_path, prompt_path, solutions_path, tokenizer, starter_path=None)
def _eval_human_eval(path):
problems = read_problems()
num_samples_per_task = 1
samples = [
dict(
task_id=task_id,
completion=generate_text(problems[task_id]["prompt"]),
)
for task_id in problems
for _ in range(num_samples_per_task)
]
write_jsonl("human_eval.jsonl", samples)
# execute bash command to run eval script
results = check_output(
[
"python",
path / "evaluate_functional_correctness.py",
"human_eval.jsonl",
]
).decode("utf-8")
print(results)
@call_parse
def main(
concode_path: Param("Path to the concode data in CodeXGLUE", str),
apps_path: Param("Path to the APPS dataset", str),
human_eval_path: Param("Path to the human eval dataset", str),
):
concode_path = Path(concode_path)
apps_path = Path(apps_path)
human_eval_path = Path(human_eval_path)
# _eval_concode(concode_path)
# _eval_human_eval(human_eval_path)
_eval_apps(apps_path)
# dataset = load_dataset("json", data_files=str(concode_path / "test.json"))
# print(dataset)
# results = bleu.compute(predictions=predictions, references=references)
# print(list(results.keys()))
# print(round(results["score"], 1))
# problems = read_problems()
# print(problems)
# num_samples_per_task = 200
# samples = [
# dict(
# task_id=task_id,
# completion=generate_text(problems[task_id]["prompt"]),
# )
# for task_id in problems[:1]
# for _ in range(num_samples_per_task)
# ]
# write_jsonl("human_eval.jsonl", samples)


@ -0,0 +1,164 @@
{"task_id": "HumanEval/0", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/1", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/2", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/3", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/4", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/5", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/6", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/7", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/8", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/9", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/10", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/11", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/12", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/13", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/14", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/15", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/16", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/17", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/18", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/19", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/20", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/21", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/22", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/23", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/24", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/25", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/26", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/27", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/28", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/29", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/30", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/31", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/32", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/33", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/34", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/35", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/36", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/37", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/38", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/39", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/40", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/41", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/42", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/43", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/44", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/45", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/46", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/47", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/48", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/49", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/50", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/51", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/52", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/53", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/54", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/55", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/56", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/57", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/58", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/59", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/60", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/61", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/62", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/63", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/64", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/65", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/66", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/67", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/68", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/69", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/70", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/71", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/72", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/73", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/74", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/75", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/76", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/77", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/78", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/79", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/80", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/81", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/82", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/83", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/84", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/85", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/86", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/87", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/88", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/89", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/90", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/91", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/92", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/93", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/94", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/95", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/96", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/97", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/98", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/99", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/100", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/101", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/102", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/103", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/104", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/105", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/106", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/107", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/108", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/109", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/110", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/111", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/112", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/113", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/114", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/115", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/116", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/117", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/118", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/119", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/120", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/121", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/122", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/123", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/124", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/125", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/126", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/127", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/128", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/129", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/130", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/131", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/132", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/133", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/134", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/135", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/136", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/137", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/138", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/139", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/140", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/141", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/142", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/143", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/144", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/145", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/146", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/147", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/148", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/149", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/150", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/151", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/152", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/153", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/154", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/155", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/156", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/157", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/158", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/159", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/160", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/161", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/162", "completion": "a = b \n y = a + 1"}
{"task_id": "HumanEval/163", "completion": "a = b \n y = a + 1"}


@ -0,0 +1,164 @@
{"task_id": "HumanEval/0", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
{"task_id": "HumanEval/1", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
{"task_id": "HumanEval/2", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
{"task_id": "HumanEval/3", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
{"task_id": "HumanEval/4", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
{"task_id": "HumanEval/5", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
{"task_id": "HumanEval/6", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
{"task_id": "HumanEval/7", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
{"task_id": "HumanEval/8", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
{"task_id": "HumanEval/9", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
{"task_id": "HumanEval/10", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 21)", "passed": false}
{"task_id": "HumanEval/11", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
{"task_id": "HumanEval/12", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
{"task_id": "HumanEval/13", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
{"task_id": "HumanEval/14", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 10)", "passed": false}
{"task_id": "HumanEval/15", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
{"task_id": "HumanEval/16", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
{"task_id": "HumanEval/17", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 18)", "passed": false}
{"task_id": "HumanEval/18", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
{"task_id": "HumanEval/19", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
{"task_id": "HumanEval/20", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
{"task_id": "HumanEval/21", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
{"task_id": "HumanEval/22", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
{"task_id": "HumanEval/23", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
{"task_id": "HumanEval/24", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 9)", "passed": false}
{"task_id": "HumanEval/25", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 16)", "passed": false}
{"task_id": "HumanEval/26", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
{"task_id": "HumanEval/27", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 9)", "passed": false}
{"task_id": "HumanEval/28", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
{"task_id": "HumanEval/29", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
{"task_id": "HumanEval/30", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
{"task_id": "HumanEval/31", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 21)", "passed": false}
{"task_id": "HumanEval/32", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 25)", "passed": false}
{"task_id": "HumanEval/33", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
{"task_id": "HumanEval/34", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 9)", "passed": false}
{"task_id": "HumanEval/35", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
{"task_id": "HumanEval/36", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
{"task_id": "HumanEval/37", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
{"task_id": "HumanEval/38", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 19)", "passed": false}
{"task_id": "HumanEval/39", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 18)", "passed": false}
{"task_id": "HumanEval/40", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 21)", "passed": false}
{"task_id": "HumanEval/41", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 16)", "passed": false}
{"task_id": "HumanEval/42", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
{"task_id": "HumanEval/43", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 20)", "passed": false}
{"task_id": "HumanEval/44", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
{"task_id": "HumanEval/45", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 9)", "passed": false}
{"task_id": "HumanEval/46", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 19)", "passed": false}
{"task_id": "HumanEval/47", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
{"task_id": "HumanEval/48", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 16)", "passed": false}
{"task_id": "HumanEval/49", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 17)", "passed": false}
{"task_id": "HumanEval/50", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
{"task_id": "HumanEval/51", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 20)", "passed": false}
{"task_id": "HumanEval/52", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
{"task_id": "HumanEval/53", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
{"task_id": "HumanEval/54", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 20)", "passed": false}
{"task_id": "HumanEval/55", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
{"task_id": "HumanEval/56", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 17)", "passed": false}
{"task_id": "HumanEval/57", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
{"task_id": "HumanEval/58", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
{"task_id": "HumanEval/59", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
{"task_id": "HumanEval/60", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 17)", "passed": false}
{"task_id": "HumanEval/61", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 17)", "passed": false}
{"task_id": "HumanEval/62", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
{"task_id": "HumanEval/63", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 18)", "passed": false}
{"task_id": "HumanEval/64", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 19)", "passed": false}
{"task_id": "HumanEval/65", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
{"task_id": "HumanEval/66", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 16)", "passed": false}
{"task_id": "HumanEval/67", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 16)", "passed": false}
{"task_id": "HumanEval/68", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 37)", "passed": false}
{"task_id": "HumanEval/69", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
{"task_id": "HumanEval/70", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
{"task_id": "HumanEval/71", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
{"task_id": "HumanEval/72", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 21)", "passed": false}
{"task_id": "HumanEval/73", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
{"task_id": "HumanEval/74", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 17)", "passed": false}
{"task_id": "HumanEval/75", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
{"task_id": "HumanEval/76", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
{"task_id": "HumanEval/77", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 16)", "passed": false}
{"task_id": "HumanEval/78", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 21)", "passed": false}
{"task_id": "HumanEval/79", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
{"task_id": "HumanEval/80", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
{"task_id": "HumanEval/81", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 28)", "passed": false}
{"task_id": "HumanEval/82", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
{"task_id": "HumanEval/83", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 8)", "passed": false}
{"task_id": "HumanEval/84", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 17)", "passed": false}
{"task_id": "HumanEval/85", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 10)", "passed": false}
{"task_id": "HumanEval/86", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 16)", "passed": false}
{"task_id": "HumanEval/87", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 23)", "passed": false}
{"task_id": "HumanEval/88", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 18)", "passed": false}
{"task_id": "HumanEval/89", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
{"task_id": "HumanEval/90", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
{"task_id": "HumanEval/91", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
{"task_id": "HumanEval/92", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 21)", "passed": false}
{"task_id": "HumanEval/93", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 17)", "passed": false}
{"task_id": "HumanEval/94", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 16)", "passed": false}
{"task_id": "HumanEval/95", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
{"task_id": "HumanEval/96", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
{"task_id": "HumanEval/97", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
{"task_id": "HumanEval/98", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
{"task_id": "HumanEval/99", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 21)", "passed": false}
{"task_id": "HumanEval/100", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 17)", "passed": false}
{"task_id": "HumanEval/101", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
{"task_id": "HumanEval/102", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
{"task_id": "HumanEval/103", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
{"task_id": "HumanEval/104", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
{"task_id": "HumanEval/105", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 25)", "passed": false}
{"task_id": "HumanEval/106", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
{"task_id": "HumanEval/107", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 26)", "passed": false}
{"task_id": "HumanEval/108", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
{"task_id": "HumanEval/109", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 30)", "passed": false}
{"task_id": "HumanEval/110", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 16)", "passed": false}
{"task_id": "HumanEval/111", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 16)", "passed": false}
{"task_id": "HumanEval/112", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
{"task_id": "HumanEval/113", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
{"task_id": "HumanEval/114", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
{"task_id": "HumanEval/115", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 38)", "passed": false}
{"task_id": "HumanEval/116", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
{"task_id": "HumanEval/117", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 16)", "passed": false}
{"task_id": "HumanEval/118", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 18)", "passed": false}
{"task_id": "HumanEval/119", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 18)", "passed": false}
{"task_id": "HumanEval/120", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 28)", "passed": false}
{"task_id": "HumanEval/121", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
{"task_id": "HumanEval/122", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 17)", "passed": false}
{"task_id": "HumanEval/123", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 20)", "passed": false}
{"task_id": "HumanEval/124", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 23)", "passed": false}
{"task_id": "HumanEval/125", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
{"task_id": "HumanEval/126", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 19)", "passed": false}
{"task_id": "HumanEval/127", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 23)", "passed": false}
{"task_id": "HumanEval/128", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
{"task_id": "HumanEval/129", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 33)", "passed": false}
{"task_id": "HumanEval/130", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 20)", "passed": false}
{"task_id": "HumanEval/131", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
{"task_id": "HumanEval/132", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 16)", "passed": false}
{"task_id": "HumanEval/133", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 17)", "passed": false}
{"task_id": "HumanEval/134", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 16)", "passed": false}
{"task_id": "HumanEval/135", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
{"task_id": "HumanEval/136", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
{"task_id": "HumanEval/137", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
{"task_id": "HumanEval/138", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 10)", "passed": false}
{"task_id": "HumanEval/139", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
{"task_id": "HumanEval/140", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
{"task_id": "HumanEval/141", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 17)", "passed": false}
{"task_id": "HumanEval/142", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 16)", "passed": false}
{"task_id": "HumanEval/143", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 23)", "passed": false}
{"task_id": "HumanEval/144", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
{"task_id": "HumanEval/145", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
{"task_id": "HumanEval/146", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
{"task_id": "HumanEval/147", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 17)", "passed": false}
{"task_id": "HumanEval/148", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 19)", "passed": false}
{"task_id": "HumanEval/149", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 18)", "passed": false}
{"task_id": "HumanEval/150", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
{"task_id": "HumanEval/151", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
{"task_id": "HumanEval/152", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 18)", "passed": false}
{"task_id": "HumanEval/153", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 20)", "passed": false}
{"task_id": "HumanEval/154", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
{"task_id": "HumanEval/155", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 10)", "passed": false}
{"task_id": "HumanEval/156", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
{"task_id": "HumanEval/157", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
{"task_id": "HumanEval/158", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
{"task_id": "HumanEval/159", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 32)", "passed": false}
{"task_id": "HumanEval/160", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 28)", "passed": false}
{"task_id": "HumanEval/161", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
{"task_id": "HumanEval/162", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 10)", "passed": false}
{"task_id": "HumanEval/163", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
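Each record above carries a boolean "passed" field, so functional-correctness pass@k can be estimated from these results with the unbiased estimator from the HumanEval paper. The snippet below is a minimal sketch (the results path is hypothetical, and the human-eval package ships its own equivalent estimator).

# Minimal sketch: aggregate per-task pass counts and estimate pass@1.
import json
import collections
from math import comb


def estimate_pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k given n samples per task, c of which passed."""
    if n - c < k:
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)


counts = collections.defaultdict(lambda: [0, 0])  # task_id -> [n, c]
with open("humaneval_results.jsonl") as f:  # hypothetical results path
    for line in f:
        record = json.loads(line)
        counts[record["task_id"]][0] += 1
        counts[record["task_id"]][1] += int(record["passed"])

pass_at_1 = sum(estimate_pass_at_k(n, c, 1) for n, c in counts.values()) / len(counts)
print(f"pass@1 = {pass_at_1:.4f}")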

View File

@ -0,0 +1,133 @@
# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# The following code is taken from CodeXGlue Repository - https://github.com/microsoft/CodeXGLUE/blob/main/Code-Code/code-to-code-trans/evaluator/CodeBLEU/bleu.py
"""Python implementation of BLEU and smooth-BLEU.
This module provides a Python implementation of BLEU and smooth-BLEU.
Smooth BLEU is computed following the method outlined in the paper:
Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic
evaluation metrics for machine translation. COLING 2004.
"""
import collections
import math


def _get_ngrams(segment, max_order):
    """Extracts all n-grams up to a given maximum order from an input segment.
    Args:
      segment: text segment from which n-grams will be extracted.
      max_order: maximum length in tokens of the n-grams returned by this
          method.
    Returns:
      The Counter containing all n-grams up to max_order in segment
      with a count of how many times each n-gram occurred.
    """
    ngram_counts = collections.Counter()
    for order in range(1, max_order + 1):
        for i in range(0, len(segment) - order + 1):
            ngram = tuple(segment[i : i + order])
            ngram_counts[ngram] += 1
    return ngram_counts


def compute_bleu(reference_corpus, translation_corpus, max_order=4, smooth=True):
    """Computes BLEU score of translated segments against one or more references.
    Args:
      reference_corpus: list of lists of references for each translation. Each
          reference should be tokenized into a list of tokens.
      translation_corpus: list of translations to score. Each translation
          should be tokenized into a list of tokens.
      max_order: Maximum n-gram order to use when computing BLEU score.
      smooth: Whether or not to apply Lin et al. 2004 smoothing.
    Returns:
      A dict with the BLEU score, per-order n-gram precisions, brevity penalty,
      length ratio, and the translation and reference lengths.
    """
    matches_by_order = [0] * max_order
    possible_matches_by_order = [0] * max_order
    reference_length = 0
    translation_length = 0
    for (references, translation) in zip(reference_corpus, translation_corpus):
        reference_length += min(len(r) for r in references)
        translation_length += len(translation)

        merged_ref_ngram_counts = collections.Counter()
        for reference in references:
            merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
        translation_ngram_counts = _get_ngrams(translation, max_order)
        overlap = translation_ngram_counts & merged_ref_ngram_counts
        for ngram in overlap:
            matches_by_order[len(ngram) - 1] += overlap[ngram]
        for order in range(1, max_order + 1):
            possible_matches = len(translation) - order + 1
            if possible_matches > 0:
                possible_matches_by_order[order - 1] += possible_matches

    precisions = [0] * max_order
    for i in range(0, max_order):
        if smooth:
            precisions[i] = (matches_by_order[i] + 1.0) / (
                possible_matches_by_order[i] + 1.0
            )
        else:
            if possible_matches_by_order[i] > 0:
                precisions[i] = (
                    float(matches_by_order[i]) / possible_matches_by_order[i]
                )
            else:
                precisions[i] = 0.0

    if min(precisions) > 0:
        p_log_sum = sum((1.0 / max_order) * math.log(p) for p in precisions)
        geo_mean = math.exp(p_log_sum)
    else:
        geo_mean = 0

    ratio = float(translation_length) / reference_length
    if ratio > 1.0:
        bp = 1.0
    else:
        bp = math.exp(1 - 1.0 / ratio)

    bleu = geo_mean * bp

    bleu_score_dict = {
        "bleu": bleu,
        "precision": precisions,
        "bp": bp,
        "ratio": ratio,
        "trans_len": translation_length,
        "ref_len": reference_length,
    }
    return bleu_score_dict  # (bleu, precisions, bp, ratio, translation_length, reference_length)


def bleu_test_case():
    """A simple functionality test case to evaluate BLEU."""
    # compute_bleu expects the reference corpus first (one list of references
    # per sample) and the generated/translated corpus second.
    references = [[["a", "=", "b", "\n", "print", "a"]]]
    generated = [["a", "=", "b", "\n", "y", "=", "a", "+", "1"]]
    score_dict = compute_bleu(references, generated, smooth=False)
    return score_dict


if __name__ == "__main__":
    score_dict = bleu_test_case()
    print(score_dict)
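
Beyond the built-in test case, a minimal usage sketch of compute_bleu follows; it assumes the module is importable as metrics.bleu (matching the import used by the extra-metrics helper below), and the token lists are purely illustrative.

from metrics.bleu import compute_bleu

# Outer list: samples; middle list: references per sample; innermost: tokens.
references = [[["def", "add", "(", "a", ",", "b", ")", ":", "return", "a", "+", "b"]]]
# One generated candidate per sample (list of token lists).
candidates = [["def", "add", "(", "a", ",", "b", ")", ":", "return", "b", "+", "a"]]

scores = compute_bleu(references, candidates, max_order=4, smooth=True)
print(scores["bleu"], scores["precision"], scores["bp"])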

View File

@ -0,0 +1,46 @@
from metrics.bleu import compute_bleu


def compute_exact_match(references, generated) -> float:
    """
    Computes Exact Match Accuracy.
    args:
        references: list of lists of references for each translation. Each
            reference should be tokenized into a list of tokens.
        generated: list of translations to score. Each translation
            should be tokenized into a list of tokens.
    returns:
        exact_match_accuracy : Float
    """
    assert len(references[0]) == len(
        generated
    ), "Number of samples should be equal in references and synthesized outputs."
    exact_match_count = 0.0
    for gen, ref in zip(generated, references[0]):
        if gen == ref:
            exact_match_count += 1
    exact_match_acc = exact_match_count / len(generated)
    return exact_match_acc


def compute_metrics(references, generated) -> dict:
    """
    Calculates various metrics and returns a dict of the computed metrics.
    args:
        references: list of lists of references for each translation. Each
            reference should be tokenized into a list of tokens.
        generated: list of translations to score. Each translation
            should be tokenized into a list of tokens.
    returns:
        A dictionary with the computed metrics.
    """
    metrics_dict = {
        "smoothed_bleu_4": None,
        "bleu_4": None,
        "exact_match_acc": None,
    }  # Update as new metrics are added.
    metrics_dict["smoothed_bleu_4"] = compute_bleu(references, generated, smooth=True)
    metrics_dict["bleu_4"] = compute_bleu(references, generated, smooth=False)
    metrics_dict["exact_match_acc"] = compute_exact_match(references, generated)
    return metrics_dict
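
A minimal usage sketch for compute_metrics, assuming this helper is importable as metrics.extra_metrics (the module path is an assumption) and reusing the same corpus layout that compute_bleu expects.

from metrics.extra_metrics import compute_metrics  # hypothetical module path

references = [[["a", "=", "b", "\n", "print", "a"]]]  # references per sample
generated = [["a", "=", "b", "\n", "y", "=", "a", "+", "1"]]  # generated token lists

metrics = compute_metrics(references, generated)
print(metrics["smoothed_bleu_4"]["bleu"], metrics["exact_match_acc"])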

View File

@ -0,0 +1,31 @@
import os
from pathlib import Path

import pandas as pd
from fastcore.script import *
from ghapi.all import GhApi

GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN")


# Query the GitHub API for the license of a given repository
def get_license_info(owner, repo):
    api = GhApi(owner=owner, repo=repo, token=GITHUB_TOKEN)
    license = api.licenses.get_for_repo(owner=owner, repo=repo)
    return license.license.name


@call_parse
def main(repos_path: Param("Path to the csv containing all of the repos", str)):
    """
    Use the pandas dataframe from the repos path to look up the license of each repo.
    """
    repos_path = Path(repos_path)
    df = pd.read_csv(repos_path)

    # Loop through repos and get their license
    licenses = []
    for _, row in df.iterrows():
        owner, repo = row["name"].split("/")
        licenses.append(get_license_info(owner, repo))

    df["license"] = licenses
    df.to_csv(repos_path.parent / f"{repos_path.stem}_with_license.csv", index=False)
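
Since main is wrapped with fastcore's @call_parse, the script takes a single CSV path argument on the command line; the helper can also be called directly from Python, as in the sketch below (the owner and repo names are hypothetical, and GITHUB_TOKEN must be set in the environment).

# Illustrative only: look up the license of a single (hypothetical) repository.
print(get_license_info("some-owner", "some-repo"))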