mirror of https://github.com/CodedotAl/gpt-code-clippy.git
synced 2024-08-16 10:20:28 +03:00
Merge branch 'evaluation' of https://github.com/ncoop57/gpt-code-clippy into evaluation
adding codebleu evaluation
commit 39ded45239
9
.gitmodules
vendored
@@ -1,3 +1,12 @@
[submodule "dependency_repos/github-downloader"]
	path = dependency_repos/github-downloader
	url = https://github.com/EleutherAI/github-downloader
[submodule "dependency_repos/apps"]
	path = dependency_repos/apps
	url = https://github.com/hendrycks/apps.git
[submodule "dependency_repos/human-eval"]
	path = dependency_repos/human-eval
	url = https://github.com/openai/human-eval
[submodule "dependency_repos/CodeXGLUE"]
	path = dependency_repos/CodeXGLUE
	url = https://github.com/microsoft/CodeXGLUE
1
dependency_repos/CodeXGLUE
Submodule
@@ -0,0 +1 @@
Subproject commit 3e7bfe6dc4a88534c7803ce1bd8d1733c1d16888
1
dependency_repos/apps
Submodule
@@ -0,0 +1 @@
Subproject commit f834ca7d7405935376aabb5830edd0c42635824e
1
dependency_repos/human-eval
Submodule
@@ -0,0 +1 @@
Subproject commit 463c980b59e818ace59f6f9803cd92c749ceae61
BIN
metrics/.DS_Store
vendored
Normal file
Binary file not shown.
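Since this commit registers the four dependency repositories above as git submodules, a fresh checkout needs them initialized before the evaluation scripts can import from them. A minimal sketch, assuming the command is run from the repository root with git available on PATH:

# Minimal sketch: fetch the dependency_repos submodules added in this commit.
# Assumes this runs from the repository root and that git is on PATH.
import subprocess

subprocess.run(["git", "submodule", "update", "--init", "--recursive"], check=True)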
162
metrics/bleu.py
@@ -12,7 +12,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
# Took the following from CodeXGlue Repository - https://github.com/microsoft/CodeXGLUE/blob/main/Code-Code/code-to-code-trans/evaluator/CodeBLEU/bleu.py
|
||||
# The following code is taken from CodeXGlue Repository - https://github.com/microsoft/CodeXGLUE/blob/main/Code-Code/code-to-code-trans/evaluator/CodeBLEU/bleu.py
|
||||
|
||||
|
||||
"""Python implementation of BLEU and smooth-BLEU.
|
||||
@@ -28,98 +28,106 @@ import math
|
||||
|
||||
|
||||
def _get_ngrams(segment, max_order):
|
||||
"""Extracts all n-grams upto a given maximum order from an input segment.
|
||||
"""Extracts all n-grams upto a given maximum order from an input segment.
|
||||
|
||||
Args:
|
||||
segment: text segment from which n-grams will be extracted.
|
||||
max_order: maximum length in tokens of the n-grams returned by this
|
||||
methods.
|
||||
Args:
|
||||
segment: text segment from which n-grams will be extracted.
|
||||
max_order: maximum length in tokens of the n-grams returned by this
|
||||
methods.
|
||||
|
||||
Returns:
|
||||
The Counter containing all n-grams upto max_order in segment
|
||||
with a count of how many times each n-gram occurred.
|
||||
"""
|
||||
ngram_counts = collections.Counter()
|
||||
for order in range(1, max_order + 1):
|
||||
for i in range(0, len(segment) - order + 1):
|
||||
ngram = tuple(segment[i:i+order])
|
||||
ngram_counts[ngram] += 1
|
||||
return ngram_counts
|
||||
Returns:
|
||||
The Counter containing all n-grams upto max_order in segment
|
||||
with a count of how many times each n-gram occurred.
|
||||
"""
|
||||
ngram_counts = collections.Counter()
|
||||
for order in range(1, max_order + 1):
|
||||
for i in range(0, len(segment) - order + 1):
|
||||
ngram = tuple(segment[i : i + order])
|
||||
ngram_counts[ngram] += 1
|
||||
return ngram_counts
|
||||
|
||||
|
||||
def compute_bleu(reference_corpus, translation_corpus, max_order=4,
|
||||
smooth=True):
|
||||
"""Computes BLEU score of translated segments against one or more references.
|
||||
def compute_bleu(reference_corpus, translation_corpus, max_order=4, smooth=True):
|
||||
"""Computes BLEU score of translated segments against one or more references.
|
||||
|
||||
Args:
|
||||
reference_corpus: list of lists of references for each translation. Each
|
||||
reference should be tokenized into a list of tokens.
|
||||
translation_corpus: list of translations to score. Each translation
|
||||
should be tokenized into a list of tokens.
|
||||
max_order: Maximum n-gram order to use when computing BLEU score.
|
||||
smooth: Whether or not to apply Lin et al. 2004 smoothing.
|
||||
Args:
|
||||
reference_corpus: list of lists of references for each translation. Each
|
||||
reference should be tokenized into a list of tokens.
|
||||
translation_corpus: list of translations to score. Each translation
|
||||
should be tokenized into a list of tokens.
|
||||
max_order: Maximum n-gram order to use when computing BLEU score.
|
||||
smooth: Whether or not to apply Lin et al. 2004 smoothing.
|
||||
|
||||
Returns:
|
||||
3-Tuple with the BLEU score, n-gram precisions, geometric mean of n-gram
|
||||
precisions and brevity penalty.
|
||||
"""
|
||||
matches_by_order = [0] * max_order
|
||||
possible_matches_by_order = [0] * max_order
|
||||
reference_length = 0
|
||||
translation_length = 0
|
||||
for (references, translation) in zip(reference_corpus,
|
||||
translation_corpus):
|
||||
reference_length += min(len(r) for r in references)
|
||||
translation_length += len(translation)
|
||||
Returns:
|
||||
3-Tuple with the BLEU score, n-gram precisions, geometric mean of n-gram
|
||||
precisions and brevity penalty.
|
||||
"""
|
||||
matches_by_order = [0] * max_order
|
||||
possible_matches_by_order = [0] * max_order
|
||||
reference_length = 0
|
||||
translation_length = 0
|
||||
for (references, translation) in zip(reference_corpus, translation_corpus):
|
||||
reference_length += min(len(r) for r in references)
|
||||
translation_length += len(translation)
|
||||
|
||||
merged_ref_ngram_counts = collections.Counter()
|
||||
for reference in references:
|
||||
merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
|
||||
translation_ngram_counts = _get_ngrams(translation, max_order)
|
||||
overlap = translation_ngram_counts & merged_ref_ngram_counts
|
||||
for ngram in overlap:
|
||||
matches_by_order[len(ngram)-1] += overlap[ngram]
|
||||
for order in range(1, max_order+1):
|
||||
possible_matches = len(translation) - order + 1
|
||||
if possible_matches > 0:
|
||||
possible_matches_by_order[order-1] += possible_matches
|
||||
merged_ref_ngram_counts = collections.Counter()
|
||||
for reference in references:
|
||||
merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
|
||||
translation_ngram_counts = _get_ngrams(translation, max_order)
|
||||
overlap = translation_ngram_counts & merged_ref_ngram_counts
|
||||
for ngram in overlap:
|
||||
matches_by_order[len(ngram) - 1] += overlap[ngram]
|
||||
for order in range(1, max_order + 1):
|
||||
possible_matches = len(translation) - order + 1
|
||||
if possible_matches > 0:
|
||||
possible_matches_by_order[order - 1] += possible_matches
|
||||
|
||||
precisions = [0] * max_order
|
||||
for i in range(0, max_order):
|
||||
if smooth:
|
||||
precisions[i] = ((matches_by_order[i] + 1.) /
|
||||
(possible_matches_by_order[i] + 1.))
|
||||
precisions = [0] * max_order
|
||||
for i in range(0, max_order):
|
||||
if smooth:
|
||||
precisions[i] = (matches_by_order[i] + 1.0) / (
|
||||
possible_matches_by_order[i] + 1.0
|
||||
)
|
||||
else:
|
||||
if possible_matches_by_order[i] > 0:
|
||||
precisions[i] = (
|
||||
float(matches_by_order[i]) / possible_matches_by_order[i]
|
||||
)
|
||||
else:
|
||||
precisions[i] = 0.0
|
||||
|
||||
if min(precisions) > 0:
|
||||
p_log_sum = sum((1.0 / max_order) * math.log(p) for p in precisions)
|
||||
geo_mean = math.exp(p_log_sum)
|
||||
else:
|
||||
if possible_matches_by_order[i] > 0:
|
||||
precisions[i] = (float(matches_by_order[i]) /
|
||||
possible_matches_by_order[i])
|
||||
else:
|
||||
precisions[i] = 0.0
|
||||
geo_mean = 0
|
||||
|
||||
if min(precisions) > 0:
|
||||
p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
|
||||
geo_mean = math.exp(p_log_sum)
|
||||
else:
|
||||
geo_mean = 0
|
||||
ratio = float(translation_length) / reference_length
|
||||
|
||||
ratio = float(translation_length) / reference_length
|
||||
if ratio > 1.0:
|
||||
bp = 1.0
|
||||
else:
|
||||
bp = math.exp(1 - 1.0 / ratio)
|
||||
bleu = geo_mean * bp
|
||||
bleu_score_dict = {
|
||||
"bleu": bleu,
|
||||
"precision": precisions,
|
||||
"bp": bp,
|
||||
"ratio": ratio,
|
||||
"trans_len": translation_length,
|
||||
"ref_len": reference_length,
|
||||
}
|
||||
return bleu_score_dict # (bleu, precisions, bp, ratio, translation_length, reference_length)
|
||||
|
||||
if ratio > 1.0:
|
||||
bp = 1.
|
||||
else:
|
||||
bp = math.exp(1 - 1. / ratio)
|
||||
bleu = geo_mean * bp
|
||||
print(geo_mean)
|
||||
bleu_score_dict = {"bleu":bleu,"precision":precisions,"bp":bp,"ratio":ratio,"trans_len":translation_length,"ref_len":reference_length}
|
||||
return bleu_score_dict#(bleu, precisions, bp, ratio, translation_length, reference_length)
|
||||
|
||||
def bleu_test_case():
|
||||
"""A simple functionality test case to evaluate BLEU"""
|
||||
generated = [[["a","=","b","\n","y","=","a","+","1"]]]
|
||||
reference = [["a","=","b","\n","print","a"]]
|
||||
score_dict = compute_bleu(generated,reference,smooth=False)
|
||||
generated = [[["a", "=", "b", "\n", "y", "=", "a", "+", "1"]]]
|
||||
reference = [["a", "=", "b", "\n", "print", "a"]]
|
||||
score_dict = compute_bleu(generated, reference, smooth=False)
|
||||
return score_dict
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
score_dict = bleu_test_case()
|
||||
print(score_dict)
|
||||
print(score_dict)
|
||||
|
metrics/extrinsic_eval.py
@@ -1,6 +1,10 @@
from metrics.bleu import compute_bleu
from metrics.parse_check import check_parse

def compute_metrics(references,generated) -> dict:
Parser = check_parse()  # Initializing parser


def compute_metrics(references, generated, lang) -> dict:
    """
    Calculates various metrics and returns a dictionary of the calculated metrics.
    args:
@@ -8,11 +12,12 @@ def compute_metrics(references,generated) -> dict:
        reference should be tokenized into a list of tokens.
    translation: list of translations to score. Each translation
        should be tokenized into a list of tokens.
    lang(str) : The language the generated code belongs to
    returns:
        A dictionary with the different metrics.
    """
    metrics_dict = {}  #Update as in new metrics are added over here.
    metrics_dict["smoothed_bleu_4"] = compute_bleu(references,generated,smooth=True)
    metrics_dict["bleu_4"] = compute_bleu(references,generated,smooth=False)

    return metrics_dict
    metrics_dict = {}  # Update as in new metrics are added over here.
    metrics_dict["smoothed_bleu_4"] = compute_bleu(references, generated, smooth=True)
    metrics_dict["bleu_4"] = compute_bleu(references, generated, smooth=False)
    metrics_dict["parse_score"] = Parser(generated, lang)["parse_score"]
    return metrics_dict
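Note that compute_bleu consumes tokenized code while the newly wired-in check_parse (added below in metrics/parse_check.py) parses raw source strings, so a caller currently needs both representations. A hedged sketch of exercising both metrics, with illustrative sample code and assuming the tree-sitter grammar bundle has already been built:

# Hedged sketch: feeding each metric the input shape it expects.
# compute_bleu wants token lists; check_parse wants raw source strings.
from metrics.bleu import compute_bleu
from metrics.parse_check import check_parse

code_str = "x = 1\nprint(x)"   # illustrative generated program
ref_str = "x = 1\nprint(x)"    # illustrative reference program

bleu = compute_bleu([[ref_str.split()]], [code_str.split()], smooth=True)
parser = check_parse()          # loads the tree-sitter grammars from disk
parse = parser([code_str], "py")
print(bleu["bleu"], parse["parse_score"], parse["index_parse"])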
53
metrics/parse_check.py
Normal file
@@ -0,0 +1,53 @@
from tree_sitter import Language, Parser


def load_tree_sitter_languages():
    """Loads language grammars to evaluate."""
    py_parser = Parser()
    py_parser.set_language(Language('./tree_sitter_utils/build/my-languages.so', 'python'))
    js_parser = Parser()
    js_parser.set_language(Language('./tree_sitter_utils/build/my-languages.so', 'javascript'))
    cpp_parser = Parser()
    cpp_parser.set_language(Language('./tree_sitter_utils/build/my-languages.so', 'cpp'))
    go_parser = Parser()
    go_parser.set_language(Language('./tree_sitter_utils/build/my-languages.so', 'go'))
    java_parser = Parser()
    java_parser.set_language(Language('./tree_sitter_utils/build/my-languages.so', 'java'))
    return {
        "py": py_parser,
        "js": js_parser,
        "cpp": cpp_parser,
        "go": go_parser,
        "java": java_parser,
    }


class check_parse:
    def __init__(self):
        self.language_dict = load_tree_sitter_languages()

    def __call__(self, batch, lang):
        """
        args:
            batch : list[str] of code generated by the model
            lang : lang should be one of the above language_dict keys

        returns:
            dict(
                parse_score = averaged score over how many datapoints parse cleanly
                index_parse = per-index flag indicating whether that datapoint parsed
            )
        """
        cumulative_parse_score = 0
        index_parse_list = []
        parser = self.language_dict[lang]
        for inp in batch:
            parsed = parser.parse(bytes(inp, "utf-8"))
            # A generation counts as parsed if tree-sitter reports no ERROR node.
            inp_ind_score = int("ERROR" not in parsed.root_node.sexp())
            cumulative_parse_score += inp_ind_score
            index_parse_list.append(inp_ind_score)
        # Average over the batch so the returned score matches the docstring above.
        parse_score = cumulative_parse_score / len(batch) if batch else 0.0
        return {"parse_score": parse_score, "index_parse": index_parse_list}


if __name__ == "__main__":
    Parse = check_parse()
    score = Parse(["""
def a():
    if bar:
        baz()"""], "py")
    print(score)
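parse_check.py loads a prebuilt grammar bundle from ./tree_sitter_utils/build/my-languages.so, which is not part of this commit. A minimal sketch of how such a bundle is typically produced with py-tree-sitter's Language.build_library; the vendor/ checkout paths are assumptions, not paths from this repository:

# Hedged sketch: build the combined grammar bundle that parse_check.py loads.
# The tree-sitter-* grammar checkouts below are assumed locations; clone them
# (e.g. from github.com/tree-sitter) wherever convenient and adjust the paths.
from tree_sitter import Language

Language.build_library(
    "tree_sitter_utils/build/my-languages.so",
    [
        "vendor/tree-sitter-python",
        "vendor/tree-sitter-javascript",
        "vendor/tree-sitter-cpp",
        "vendor/tree-sitter-go",
        "vendor/tree-sitter-java",
    ],
)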
BIN
metrics/tree_sitter_utils/.DS_Store
vendored
Normal file
Binary file not shown.
File diff suppressed because one or more lines are too long
102
scripts/evaluation/apps_utils/generate_gpt_codes.py
Normal file
@@ -0,0 +1,102 @@
|
||||
# MIT License
|
||||
|
||||
# Copyright (c) 2021 Dan Hendrycks and contributors.
|
||||
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
|
||||
"""
|
||||
Run a trained model to generate Python code.
|
||||
"""
|
||||
|
||||
import io
|
||||
import json
|
||||
import logging
|
||||
import math
|
||||
import random
|
||||
import numpy as np
|
||||
import os
|
||||
import pprint
|
||||
import sys
|
||||
import time
|
||||
import transformers
|
||||
import torch
|
||||
|
||||
from apps_utils.reindent import run as run_reindent
|
||||
|
||||
# for timing and debugging
|
||||
from datetime import datetime, date
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
def reindent_code(codestr):
|
||||
"""
|
||||
Given code string, reindent it in the same way that the
|
||||
Github dataset was indented
|
||||
"""
|
||||
codestr = io.StringIO(codestr)
|
||||
ret = io.StringIO()
|
||||
|
||||
run_reindent(
|
||||
codestr,
|
||||
ret,
|
||||
config={
|
||||
"dry-run": False,
|
||||
"help": False,
|
||||
"to": 10,
|
||||
"from": -1,
|
||||
"tabs": True,
|
||||
"encoding": "utf-8",
|
||||
"is-tabs": False,
|
||||
"tabsize": 10,
|
||||
"all-tabs": False,
|
||||
},
|
||||
)
|
||||
|
||||
return ret.getvalue()
|
||||
|
||||
|
||||
def generate_prompt(
|
||||
test_case_path, prompt_path, solutions_path, tokenizer, starter_path=None
|
||||
):
|
||||
_input = "\nQUESTION:\n"
|
||||
with open(prompt_path, "r") as f:
|
||||
data = f.readlines()
|
||||
data = "".join(data)
|
||||
_input += data
|
||||
if starter_path != None:
|
||||
with open(starter_path, "r") as f:
|
||||
data = f.readlines()
|
||||
data = "".join(data)
|
||||
data = "\n" + data # + "\n"
|
||||
_input += data
|
||||
else:
|
||||
# _input += "\n\n"
|
||||
pass
|
||||
|
||||
with open(test_case_path, "r") as f:
|
||||
data = json.load(f)
|
||||
if not data.get("fn_name"):
|
||||
_input += "\nUse Standard Input format" # \n"
|
||||
else:
|
||||
_input += "\nUse Call-Based format" # \n"
|
||||
|
||||
_input += "\nANSWER:\n"
|
||||
|
||||
return _input
|
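A hedged sketch of driving generate_prompt above for a single APPS problem directory; the concrete problem path and the gpt-neo tokenizer are illustrative choices, and evaluate.py further down wires this up across the whole test split:

# Hedged sketch: build one APPS prompt with the helper defined above.
# The problem folder layout (question.txt, input_output.json, solutions.json,
# starter_code.py) follows the APPS dataset; the path itself is illustrative.
from pathlib import Path
from transformers import AutoTokenizer
from apps_utils.generate_gpt_codes import generate_prompt

prob = Path("dependency_repos/apps/APPS/test/0000")  # illustrative problem folder
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125M")
starter = prob / "starter_code.py"
prompt = generate_prompt(
    prob / "input_output.json",
    prob / "question.txt",
    prob / "solutions.json",
    tokenizer,
    starter_path=starter if starter.exists() else None,
)
print(prompt[:300])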
227
scripts/evaluation/apps_utils/reindent.py
Normal file
@@ -0,0 +1,227 @@
|
||||
# MIT License
|
||||
|
||||
# Copyright (c) 2021 Dan Hendrycks and contributors.
|
||||
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
|
||||
"""
|
||||
Reindent files.
|
||||
"""
|
||||
|
||||
from __future__ import print_function
|
||||
import sys
|
||||
import getopt
|
||||
import codecs
|
||||
import tempfile
|
||||
import shutil
|
||||
import os
|
||||
|
||||
|
||||
def _find_indentation(line, config):
|
||||
if len(line) and line[0] in (" ", "\t") and not line.isspace():
|
||||
if line[0] == "\t":
|
||||
config['is-tabs'] = True
|
||||
# Find indentation
|
||||
i = 0
|
||||
for char in list(line):
|
||||
if char not in (" ", "\t"):
|
||||
break
|
||||
i += 1
|
||||
config["from"] = i
|
||||
|
||||
|
||||
def find_indentation(line, config):
|
||||
# Find indentation level used in file
|
||||
if config['from'] < 0:
|
||||
_find_indentation(line, config)
|
||||
|
||||
if config['from'] >= 0:
|
||||
# Set old indent
|
||||
indent = " " if not config['is-tabs'] else "\t"
|
||||
indent = indent * config['from']
|
||||
|
||||
# Set new indent
|
||||
newindent = " " if not config['tabs'] else "\t"
|
||||
if not config['tabs']:
|
||||
newindent = newindent * config['to']
|
||||
|
||||
return indent, newindent
|
||||
|
||||
# Continue to the next line, indentation not found
|
||||
return False
|
||||
|
||||
|
||||
def replace_inline_tabs(content, config):
|
||||
newcontent = ""
|
||||
imagined_i = 0
|
||||
for i in range(0, len(content)):
|
||||
char = content[i]
|
||||
if char == '\t':
|
||||
spaces = config['tabsize']-(imagined_i % config['tabsize'])
|
||||
newcontent += " " * spaces
|
||||
imagined_i += spaces
|
||||
else:
|
||||
newcontent += char
|
||||
imagined_i += 1
|
||||
return newcontent
|
||||
|
||||
|
||||
def run(fd_in, fd_out, config):
|
||||
from reindent_4_spaces import Reindenter
|
||||
import io
|
||||
|
||||
inter = io.StringIO()
|
||||
ri = Reindenter(fd_in)
|
||||
ri.run()
|
||||
ri.write(inter)
|
||||
fd_in = inter
|
||||
fd_in.seek(0)
|
||||
|
||||
while True:
|
||||
line = fd_in.readline()
|
||||
if not line:
|
||||
break
|
||||
line = line.rstrip('\r\n')
|
||||
|
||||
# Find indentation style used in file if not set
|
||||
if config['from'] < 0:
|
||||
indent = find_indentation(line, config)
|
||||
if not indent:
|
||||
print(line, file=fd_out)
|
||||
continue
|
||||
indent, newindent = indent
|
||||
|
||||
# Find current indentation level
|
||||
level = 0
|
||||
while True:
|
||||
whitespace = line[:len(indent) * (level + 1)]
|
||||
if whitespace == indent * (level + 1):
|
||||
level += 1
|
||||
else:
|
||||
break
|
||||
|
||||
content = line[len(indent) * level:]
|
||||
if config['all-tabs']:
|
||||
content = replace_inline_tabs(content, config)
|
||||
|
||||
line = (newindent * level) + content
|
||||
print(line, file=fd_out)
|
||||
# print(config)
|
||||
|
||||
|
||||
def run_files(filenames, config):
|
||||
for filename in filenames:
|
||||
with codecs.open(filename, encoding=config['encoding']) as fd_in:
|
||||
if config['dry-run']:
|
||||
print("Filename: %s" % filename)
|
||||
fd_out = sys.stdout
|
||||
else:
|
||||
fd_out = tempfile.NamedTemporaryFile(mode='wb', delete=False)
|
||||
fd_out.close()
|
||||
fd_out = codecs.open(fd_out.name, "wb", encoding=config['encoding'])
|
||||
|
||||
run(fd_in, fd_out, config)
|
||||
|
||||
if not config["dry-run"]:
|
||||
fd_out.close()
|
||||
shutil.copy(fd_out.name, filename)
|
||||
os.remove(fd_out.name)
|
||||
|
||||
|
||||
def main(args):
|
||||
config = {
|
||||
"dry-run": False,
|
||||
"help": False,
|
||||
"to": 4,
|
||||
"from": -1,
|
||||
"tabs": False,
|
||||
"encoding": "utf-8",
|
||||
"is-tabs": False,
|
||||
"tabsize": 4,
|
||||
"all-tabs": False
|
||||
}
|
||||
possible_args = {
|
||||
"d": "dry-run",
|
||||
"h": "help",
|
||||
"t:": "to=",
|
||||
"f:": "from=",
|
||||
"n": "tabs",
|
||||
"e:": "encoding=",
|
||||
"s:": "tabsize=",
|
||||
"a": "all-tabs",
|
||||
}
|
||||
optlist, filenames = getopt.getopt(
|
||||
args[1:],
|
||||
"".join(possible_args.keys()),
|
||||
possible_args.values()
|
||||
)
|
||||
|
||||
shortargs, longargs = [], []
|
||||
for shortarg in possible_args:
|
||||
shortargs.append(shortarg.rstrip(":"))
|
||||
longargs.append(possible_args[shortarg].rstrip("="))
|
||||
|
||||
for opt, val in optlist:
|
||||
opt = opt.lstrip("-")
|
||||
if opt in shortargs:
|
||||
opt = longargs[shortargs.index(opt)]
|
||||
if isinstance(config[opt], bool):
|
||||
config[opt] = True
|
||||
elif isinstance(config[opt], int):
|
||||
config[opt] = int(val)
|
||||
else:
|
||||
config[opt] = val
|
||||
|
||||
if config['help']:
|
||||
help = """
|
||||
Usage: %s [options] filename(s)
|
||||
Options:
|
||||
-h, --help Show this message
|
||||
-d, --dry-run Don't save anything, just print
|
||||
the result
|
||||
-t <n>, --to <n> Convert to this number of spaces
|
||||
(default: 4)
|
||||
-f <n>, --from <n> Convert from this number of spaces
|
||||
(default: auto-detect, will also
|
||||
detect tabs)
|
||||
-n, --tabs Don't convert indentation to spaces,
|
||||
convert to tabs instead. -t and
|
||||
--to will have no effect.
|
||||
-a, --all-tabs Also convert tabs used for alignment
|
||||
in the code (Warning: will replace
|
||||
all tabs in the file, even if inside
|
||||
a string)
|
||||
-s <n>, --tabsize <n> Set how many spaces one tab is
|
||||
(only has an effect on -a, default: 4)
|
||||
-e <s>, --encoding <s> Open files with specified encoding
|
||||
(default: utf-8)
|
||||
""" % args[0]
|
||||
|
||||
# Also removes 8 leading spaces to remove our indentation
|
||||
print("\n".join([x[8:] for x in help[1:].split("\n")]))
|
||||
sys.exit(0)
|
||||
|
||||
if filenames:
|
||||
run_files(filenames, config)
|
||||
else:
|
||||
run(sys.stdin, sys.stdout, config)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main(sys.argv)
|
158
scripts/evaluation/apps_utils/test_one_solution.py
Normal file
@@ -0,0 +1,158 @@
|
||||
# MIT License
|
||||
|
||||
# Copyright (c) 2021 Dan Hendrycks and contributors.
|
||||
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
"""
|
||||
Run solutions from one problem.
|
||||
"""
|
||||
|
||||
import io
|
||||
import json
|
||||
import logging
|
||||
import math
|
||||
import numpy as np
|
||||
import os
|
||||
import pprint
|
||||
import sys
|
||||
import apps_utils.testing_util as test_util
|
||||
import time
|
||||
|
||||
# for timing debugging
|
||||
from datetime import datetime, date
|
||||
from pathlib import Path
|
||||
from tqdm import tqdm
|
||||
|
||||
from typing import List
|
||||
|
||||
|
||||
def print_results(results, args):
|
||||
res = []
|
||||
per_prob_res = []
|
||||
all_correct = []
|
||||
for index in results:
|
||||
res.extend(results[index])
|
||||
per_prob_res.append(np.mean(results[index]))
|
||||
all_correct.append(np.all(results[index]))
|
||||
tmp_results = res
|
||||
compile_errors = len(tmp_results[tmp_results == -2])
|
||||
runtime_errors = len(tmp_results[tmp_results == -1])
|
||||
failures = len(tmp_results[tmp_results == False])
|
||||
successes = len(tmp_results[tmp_results == True])
|
||||
total_testcases = len(res)
|
||||
if args.debug:
|
||||
print(
|
||||
f"number of compile errors = {compile_errors} avg = {compile_errors / total_testcases }"
|
||||
)
|
||||
print(
|
||||
f"number of runtime errors = {runtime_errors} avg = {runtime_errors / total_testcases}"
|
||||
)
|
||||
print(f"number of test cases run = {total_testcases}")
|
||||
|
||||
print(
|
||||
f"Test Case Average (average accuracy over problems) = {np.mean(per_prob_res)}"
|
||||
)
|
||||
print(
|
||||
f"Strict Accuracy (all test cases passed / total problems) = {np.mean(all_correct)}"
|
||||
)
|
||||
|
||||
|
||||
def eval_and_save_problems(test_loc, save):
|
||||
test_path = Path(test_loc)
|
||||
problems = list(test_path.glob("*/"))
|
||||
|
||||
print(len(problems))
|
||||
gpt_codes = {}
|
||||
gpt_bleu = {}
|
||||
gpt_codebleu = {}
|
||||
results = {}
|
||||
codes_loc = os.path.join(save, f"all_codes.json")
|
||||
# if not os.path.exists(codes_loc):
|
||||
# codes_loc = os.path.join(args.save, f"{args.start}-{args.end}_codes.json")
|
||||
|
||||
if os.path.exists(codes_loc):
|
||||
results_loc = os.path.join(save, f"all_results.json")
|
||||
print(codes_loc, results_loc)
|
||||
|
||||
with open(codes_loc, "r") as f:
|
||||
gpt_codes = json.load(f)
|
||||
|
||||
# main eval loop
|
||||
for index, problem in enumerate(tqdm(problems[:2])):
|
||||
try:
|
||||
# if args.debug:
|
||||
# print(f"\n\nproblem path = {problem}")
|
||||
output_str = gpt_codes[str(index)]
|
||||
except:
|
||||
print("CANNOT FIND OUTPUT_STR FOR", problem)
|
||||
continue
|
||||
prob_path = problem # os.path.join(args.root, problem)
|
||||
|
||||
# with open(os.path.join(prob_path, "solutions.json"), "r") as f:
|
||||
# sols = json.load(f)
|
||||
|
||||
if not os.path.exists(save):
|
||||
os.makedirs(save)
|
||||
|
||||
res = []
|
||||
# for o_idx, o in enumerate(output_str):
|
||||
# print(o)
|
||||
# if args.debug:
|
||||
# print(f"\nTesting solution {o_idx}")
|
||||
curr_res = [-2]
|
||||
try:
|
||||
curr_res = test_util.run_test(
|
||||
prob_path=prob_path, test=output_str, debug=False # args.debug
|
||||
)
|
||||
fixed = []
|
||||
for e in curr_res:
|
||||
if isinstance(e, np.ndarray):
|
||||
e = e.item(0)
|
||||
if isinstance(e, np.bool_):
|
||||
e = bool(e)
|
||||
fixed.append(e)
|
||||
curr_res = fixed
|
||||
if not np.all(curr_res):
|
||||
print(f"Results were not all True: {curr_res}")
|
||||
except Exception as e:
|
||||
print(f"test framework exception = {repr(e)}{e}\n")
|
||||
break
|
||||
finally:
|
||||
assert isinstance(curr_res, list)
|
||||
res.append(curr_res)
|
||||
|
||||
# if args.debug:
|
||||
# print(
|
||||
# f"\nHow to read results [-2] = compile error, [-1] = runtime error [False] = failed test case [True] = passed test case"
|
||||
# )
|
||||
# print(f"results = {res}")
|
||||
|
||||
results[index] = res
|
||||
|
||||
with open(results_loc, "w") as f:
|
||||
try:
|
||||
f.write(json.dumps(results))
|
||||
except Exception as e:
|
||||
import pdb
|
||||
|
||||
pdb.set_trace()
|
||||
print("didn't save problem due to {e}")
|
||||
|
||||
return results
|
544
scripts/evaluation/apps_utils/testing_util.py
Normal file
@@ -0,0 +1,544 @@
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import io
|
||||
import faulthandler
|
||||
|
||||
# used for debugging to time steps
|
||||
from datetime import datetime
|
||||
|
||||
# to run the solution files we're using a timing based approach
|
||||
import signal
|
||||
|
||||
import numpy as np
|
||||
# for capturing the stdout
|
||||
from io import StringIO
|
||||
from typing import get_type_hints
|
||||
from typing import List, Tuple
|
||||
# used for testing the code that reads from input
|
||||
from unittest.mock import patch, mock_open
|
||||
|
||||
from pyext import RuntimeModule
|
||||
|
||||
from enum import Enum
|
||||
class CODE_TYPE(Enum):
|
||||
call_based = 0
|
||||
standard_input = 1
|
||||
|
||||
# stuff for setting up signal timer
|
||||
class TimeoutException(Exception):
|
||||
pass
|
||||
def timeout_handler(signum, frame):
|
||||
print("alarm went off")
|
||||
#return
|
||||
raise TimeoutException
|
||||
signal.signal(signal.SIGALRM, timeout_handler)
|
||||
timeout = 4 # seconds
|
||||
|
||||
# used to capture stdout as a list
|
||||
# from https://stackoverflow.com/a/16571630/6416660
|
||||
# alternative use redirect_stdout() from contextlib
|
||||
class Capturing(list):
|
||||
def __enter__(self):
|
||||
self._stdout = sys.stdout
|
||||
sys.stdout = self._stringio = StringIO()
|
||||
# Make closing the StringIO a no-op
|
||||
self._stringio.close = lambda x: 1
|
||||
return self
|
||||
def __exit__(self, *args):
|
||||
self.extend(self._stringio.getvalue().splitlines())
|
||||
del self._stringio # free up some memory
|
||||
sys.stdout = self._stdout
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(description="Utility for testing code generation.")
|
||||
parser.add_argument("-v", "--verbosity-level", action="store", type=int,
|
||||
help="")
|
||||
parser.add_argument("-s", "--source", type=str, default="leetcode",
|
||||
choices=["leetcode", "atcoder", "codewars",],
|
||||
help="which data source to gather from.")
|
||||
parser.add_argument("-d", "--data", type=str, default="question",
|
||||
choices=["question", "q", "solutions", "sol", "s", "starter", "tests", "t"],
|
||||
help="which type of data to receive.")
|
||||
parser.add_argument("-n", "--number", type=int, default=0,
|
||||
help="which problem to query.")
|
||||
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
|
||||
|
||||
def get_valid_problems(data_dir="leetcode"):
|
||||
# these are unnecessary atm
|
||||
if data_dir == "leetcode":
|
||||
root = os.path.join(args.source, "data")
|
||||
elif data_dir == "atcoder":
|
||||
pass
|
||||
|
||||
root = os.path.join(data_dir, "data")
|
||||
if os.path.exists(os.path.join(data_dir, "valid_problems.json")):
|
||||
with open(os.path.join(data_dir, "valid_problems.json"), "r") as f:
|
||||
return json.load(f)
|
||||
|
||||
# after we compute it once let's save it and load that instead
|
||||
# TODO determine if might be better to reload each time
|
||||
tmp = os.listdir(root)
|
||||
valid_probs = []
|
||||
for folder in tmp:
|
||||
prob_path = os.path.join(root, folder)
|
||||
files = os.listdir(prob_path)
|
||||
#TODO add more validity checks
|
||||
if "input_output.json" in files or "sols.json" in files:
|
||||
valid_probs.append(prob_path)
|
||||
valid_probs = sorted(valid_probs)
|
||||
#with open(os.path.join(args.source,"valid_problems.json"), "w") as f:
|
||||
# json.dump(valid_probs, f)
|
||||
return valid_probs
|
||||
|
||||
|
||||
def get_question(problem_list, prob_index):
|
||||
root = problem_list[prob_index]
|
||||
#print("get q", root)
|
||||
if os.path.exists(os.path.join(root, "question.txt")):
|
||||
with open(os.path.join(root, "question.txt")) as f:
|
||||
question = f.readlines()
|
||||
else:
|
||||
print("question prompt not found")
|
||||
question = ""
|
||||
question = "".join(question)
|
||||
return question
|
||||
|
||||
|
||||
def get_solutions(problem_list, prob_index):
|
||||
root = problem_list[prob_index]
|
||||
if os.path.exists(os.path.join(root, "solutions.json")):
|
||||
with open(os.path.join(root, "solutions.json")) as f:
|
||||
sols = json.load(f)
|
||||
return sols
|
||||
|
||||
|
||||
def run_test(prob_path:str=None, problem_list:List[str]=None, prob_index:int=None,
|
||||
test:str=None, debug:bool=False):
|
||||
"""
|
||||
if test is not None it'll try to run the code.
|
||||
otherwise it'll just return an input and output pair.
|
||||
"""
|
||||
if prob_path is None and problem_list is None:
|
||||
print("please provide either prob_path or problem_list")
|
||||
exit()
|
||||
|
||||
if debug:
|
||||
print(f"start = {datetime.now().time()}")
|
||||
if prob_path is not None:
|
||||
root = prob_path
|
||||
elif problem_list is not None:
|
||||
root = problem_list[prob_index]
|
||||
|
||||
if os.path.exists(os.path.join(root, "input_output.json")):
|
||||
with open(os.path.join(root, "input_output.json")) as f:
|
||||
in_outs = json.load(f)
|
||||
if debug:
|
||||
print(f"test cases json = {in_outs['inputs']} {in_outs['outputs']}")
|
||||
|
||||
if in_outs.get("fn_name") is None:
|
||||
which_type = CODE_TYPE.standard_input # Standard input
|
||||
method_name = None
|
||||
else:
|
||||
which_type = CODE_TYPE.call_based # Call-based
|
||||
method_name = in_outs["fn_name"]
|
||||
if debug:
|
||||
print(f"loaded json = {datetime.now().time()}")
|
||||
|
||||
#else:
|
||||
# continue
|
||||
if test is None:
|
||||
return in_outs
|
||||
elif test is not None:
|
||||
results = []
|
||||
sol = "import sys\nimport time\nimport itertools\nfrom itertools import accumulate, product, permutations, combinations\nimport collections\nfrom collections import Counter, OrderedDict, deque, defaultdict, ChainMap\nfrom functools import lru_cache\nimport math\nfrom math import sqrt, sin, cos, tan, ceil, fabs, floor, gcd, exp, log, log2\nimport fractions\nfrom typing import List, Tuple\nimport numpy as np\nimport random\nimport heapq\nfrom heapq import *\n"
|
||||
if debug:
|
||||
print(f"loading test code = {datetime.now().time()}")
|
||||
|
||||
if which_type == CODE_TYPE.call_based:
|
||||
sol += test
|
||||
if debug: # or True:
|
||||
print(f"sol = {sol}")
|
||||
signal.alarm(timeout)
|
||||
try:
|
||||
tmp_sol = RuntimeModule.from_string("tmp_sol", "", sol)
|
||||
if "class Solution" not in test:
|
||||
tmp = tmp_sol
|
||||
else:
|
||||
tmp = tmp_sol.Solution()
|
||||
signal.alarm(0)
|
||||
except Exception as e:
|
||||
signal.alarm(0)
|
||||
print(f"type 0 compilation error = {e}")
|
||||
results.append(-2)
|
||||
return results
|
||||
signal.alarm(0)
|
||||
|
||||
elif which_type == CODE_TYPE.standard_input:
|
||||
# sol
|
||||
tmp_test = test.split("\n")
|
||||
|
||||
new_test = []
|
||||
for x in tmp_test:
|
||||
if (not x.startswith("from ")) and (not x.startswith("import ")):
|
||||
new_test.append("\t" + x + "\n")
|
||||
else:
|
||||
new_test.append(x + "\n")
|
||||
tmp_test = new_test
|
||||
|
||||
new_test = ""
|
||||
started = False
|
||||
for i in tmp_test:
|
||||
if i.startswith("\t") and not started:
|
||||
new_test += "stdin = sys.stdin\nstdout = sys.stdout\n"
|
||||
new_test += "def code():\n"
|
||||
new_test += i
|
||||
started = True
|
||||
elif started and ((i.startswith("from ")) or (i.startswith("import "))):
|
||||
new_test += "\t" + i
|
||||
else:
|
||||
new_test += i
|
||||
tmp_test = new_test
|
||||
|
||||
sol += tmp_test
|
||||
if debug:
|
||||
print(f"sol = {sol}")
|
||||
# print(f"{o}")
|
||||
method_name = "code"
|
||||
signal.alarm(timeout)
|
||||
try:
|
||||
tmp_sol = RuntimeModule.from_string("tmp_sol", "", sol)
|
||||
tmp = tmp_sol
|
||||
signal.alarm(0)
|
||||
except Exception as e:
|
||||
signal.alarm(0)
|
||||
print(f"type 1 compilation error = {e}")
|
||||
results.append(-2)
|
||||
return results
|
||||
signal.alarm(0)
|
||||
if debug:
|
||||
print(f"get method = {datetime.now().time()}")
|
||||
|
||||
try:
|
||||
method = getattr(tmp, method_name) # get_attr second arg must be str
|
||||
except:
|
||||
signal.alarm(0)
|
||||
e = sys.exc_info()
|
||||
print(f"unable to get function error = {e}")
|
||||
return results
|
||||
|
||||
for index, inputs in enumerate(in_outs["inputs"]):
|
||||
# JSON forces dictionaries to have string keys; this undoes this (assuming a singleton list)
|
||||
try:
|
||||
if isinstance(inputs[0], dict):
|
||||
inputs = [{int(k): v for k,v in inputs[0].items()}]
|
||||
except:
|
||||
True
|
||||
try:
|
||||
if isinstance(in_outs["outputs"][index], dict):
|
||||
in_outs["outputs"][index] = [{int(k): v for k,v in in_outs["outputs"][index].items()}]
|
||||
except:
|
||||
True
|
||||
try:
|
||||
if isinstance(in_outs["outputs"][index][0], dict):
|
||||
in_outs["outputs"][index] = [{int(k): v for k,v in in_outs["outputs"][index][0].items()}]
|
||||
except:
|
||||
True
|
||||
|
||||
if debug:
|
||||
print(f"time: {datetime.now().time()} testing index = {index} inputs = {inputs}, {type(inputs)}. type = {which_type}")
|
||||
if which_type == CODE_TYPE.call_based: # Call-based
|
||||
signal.alarm(timeout)
|
||||
faulthandler.enable()
|
||||
try:
|
||||
# print("------------")
|
||||
# print(inputs)
|
||||
output = method(*inputs)
|
||||
|
||||
# ground truth sequences are not tuples
|
||||
if isinstance(output, tuple):
|
||||
output = list(output)
|
||||
|
||||
tmp_result = output == in_outs["outputs"][index]
|
||||
if isinstance(in_outs["outputs"][index], list) and in_outs["outputs"][index]:
|
||||
tmp_result = tmp_result or (output == in_outs["outputs"][index][0])
|
||||
|
||||
# ground truth sequences are not tuples
|
||||
try:
|
||||
if isinstance(output[0], tuple):
|
||||
tmp_result = tmp_result or ([list(x) for x in output] == in_outs["outputs"][index][0])
|
||||
except:
|
||||
True
|
||||
results.append(tmp_result)
|
||||
|
||||
# reset the alarm
|
||||
signal.alarm(0)
|
||||
except Exception as e:
|
||||
signal.alarm(0)
|
||||
faulthandler.disable()
|
||||
print(f"Standard input runtime error or time limit exceeded error = {e}")
|
||||
results.append(-1)
|
||||
continue
|
||||
faulthandler.disable()
|
||||
signal.alarm(0)
|
||||
if debug:
|
||||
print(f"outputs = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
|
||||
elif which_type == CODE_TYPE.standard_input: # Standard input
|
||||
faulthandler.enable()
|
||||
signal.alarm(timeout)
|
||||
passed = False
|
||||
|
||||
if isinstance(inputs, list):
|
||||
inputs = "\n".join(inputs)
|
||||
if isinstance(in_outs['outputs'][index], list):
|
||||
in_outs['outputs'][index] = "\n".join(in_outs['outputs'][index])
|
||||
|
||||
with Capturing() as output:
|
||||
try:
|
||||
call_method(method, inputs)
|
||||
# reset the alarm
|
||||
signal.alarm(0)
|
||||
passed = True
|
||||
except Exception as e:
|
||||
# runtime error or took too long
|
||||
signal.alarm(0)
|
||||
print(f"Call-based runtime error or time limit exceeded error = {repr(e)}{e}")
|
||||
results.append(-1)
|
||||
signal.alarm(0)
|
||||
|
||||
if not passed:
|
||||
if debug:
|
||||
nl = "\n"
|
||||
if not isinstance(inputs, list):
|
||||
print(f"not passed output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl,' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
|
||||
else:
|
||||
print(f"not passed output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
|
||||
continue
|
||||
|
||||
if passed and debug:
|
||||
print(f"==> output = {output}, test outputs = {in_outs['outputs'][index]}")
|
||||
|
||||
if custom_compare_(output, in_outs['outputs'][index]):
|
||||
tmp_result = True
|
||||
results.append(tmp_result)
|
||||
continue
|
||||
|
||||
# ground truth sequences are expressed as lists not tuples
|
||||
if isinstance(output, tuple):
|
||||
output = list(output)
|
||||
|
||||
tmp_result = False
|
||||
try:
|
||||
tmp_result = (output == [in_outs["outputs"][index]])
|
||||
if isinstance(in_outs["outputs"][index], list):
|
||||
tmp_result = tmp_result or (output == in_outs["outputs"][index])
|
||||
if isinstance(output[0], str):
|
||||
tmp_result = tmp_result or ([e.strip() for e in output] == in_outs["outputs"][index])
|
||||
except Exception as e:
|
||||
print(f"Failed check1 exception = {e}")
|
||||
pass
|
||||
|
||||
if tmp_result == True:
|
||||
results.append(tmp_result)
|
||||
continue
|
||||
|
||||
# try one more time without \n
|
||||
if isinstance(in_outs["outputs"][index], list):
|
||||
for tmp_index, i in enumerate(in_outs["outputs"][index]):
|
||||
in_outs["outputs"][index][tmp_index] = i.split("\n")
|
||||
in_outs["outputs"][index][tmp_index] = [x.strip() for x in in_outs["outputs"][index][tmp_index] if x]
|
||||
else:
|
||||
in_outs["outputs"][index] = in_outs["outputs"][index].split("\n")
|
||||
in_outs["outputs"][index] = list(filter(len, in_outs["outputs"][index]))
|
||||
in_outs["outputs"][index] = list(map(lambda x:x.strip(), in_outs["outputs"][index]))
|
||||
|
||||
try:
|
||||
tmp_result = (output == [in_outs["outputs"][index]])
|
||||
if isinstance(in_outs["outputs"][index], list):
|
||||
tmp_result = tmp_result or (output == in_outs["outputs"][index])
|
||||
except Exception as e:
|
||||
print(f"Failed check2 exception = {e}")
|
||||
pass
|
||||
|
||||
if tmp_result == True:
|
||||
results.append(tmp_result)
|
||||
continue
|
||||
|
||||
# try by converting the output into a split up list too
|
||||
if isinstance(output, list):
|
||||
output = list(filter(len, output))
|
||||
|
||||
if debug:
|
||||
nl = "\n"
|
||||
if not isinstance(inputs, list):
|
||||
print(f"output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl,' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
|
||||
else:
|
||||
print(f"output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
|
||||
|
||||
if tmp_result == True:
|
||||
results.append(tmp_result)
|
||||
continue
|
||||
|
||||
try:
|
||||
tmp_result = (output == [in_outs["outputs"][index]])
|
||||
if isinstance(in_outs["outputs"][index], list):
|
||||
tmp_result = tmp_result or (output == in_outs["outputs"][index])
|
||||
except Exception as e:
|
||||
print(f"Failed check3 exception = {e}")
|
||||
pass
|
||||
|
||||
try:
|
||||
output_float = [float(e) for e in output]
|
||||
gt_float = [float(e) for e in in_outs['outputs'][index]]
|
||||
tmp_result = tmp_result or ((len(output_float) == len(gt_float)) and np.allclose(output_float, gt_float))
|
||||
except Exception as e:
|
||||
pass
|
||||
try:
|
||||
if isinstance(output[0], list):
|
||||
output_float = [float(e) for e in output[0]]
|
||||
gt_float = [float(e) for e in in_outs['outputs'][index][0]]
|
||||
tmp_result = tmp_result or ((len(output_float) == len(gt_float)) and np.allclose(output_float, gt_float))
|
||||
except Exception as e:
|
||||
pass
|
||||
|
||||
if tmp_result == True:
|
||||
results.append(tmp_result)
|
||||
continue
|
||||
|
||||
# try by converting the stuff into split up list
|
||||
if isinstance(in_outs["outputs"][index], list):
|
||||
for tmp_index, i in enumerate(in_outs["outputs"][index]):
|
||||
in_outs["outputs"][index][tmp_index] = set(i.split())
|
||||
else:
|
||||
in_outs["outputs"][index] = set(in_outs["outputs"][index].split())
|
||||
|
||||
try:
|
||||
tmp_result = (output == in_outs["outputs"][index])
|
||||
except Exception as e:
|
||||
print(f"Failed check4 exception = {e}")
|
||||
continue
|
||||
|
||||
if tmp_result == True:
|
||||
results.append(tmp_result)
|
||||
continue
|
||||
|
||||
# try by converting the output into a split up list too
|
||||
if isinstance(output, list):
|
||||
for tmp_index, i in enumerate(output):
|
||||
output[tmp_index] = i.split()
|
||||
output = list(filter(len, output))
|
||||
for tmp_index, i in enumerate(output):
|
||||
output[tmp_index] = set(i)
|
||||
else:
|
||||
output = output.split()
|
||||
output = list(filter(len, output))
|
||||
output = set(output)
|
||||
|
||||
try:
|
||||
tmp_result = (set(frozenset(s) for s in output) == set(frozenset(s) for s in in_outs["outputs"][index]))
|
||||
except Exception as e:
|
||||
print(f"Failed check5 exception = {e}")
|
||||
|
||||
|
||||
# if they are all numbers, round so that similar numbers are treated as identical
|
||||
try:
|
||||
tmp_result = tmp_result or (set(frozenset(round(float(t),3) for t in s) for s in output) ==\
|
||||
set(frozenset(round(float(t),3) for t in s) for s in in_outs["outputs"][index]))
|
||||
except Exception as e:
|
||||
print(f"Failed check6 exception = {e}")
|
||||
|
||||
if tmp_result == True and debug:
|
||||
print("PASSED")
|
||||
|
||||
results.append(tmp_result)
|
||||
|
||||
if debug:
|
||||
nl = "\n"
|
||||
if not isinstance(inputs, list):
|
||||
print(f"output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl,' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
|
||||
else:
|
||||
print(f"output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
|
||||
|
||||
|
||||
return results
|
||||
|
||||
def custom_compare_(output, ground_truth):
|
||||
|
||||
if isinstance(output, list):
|
||||
output_1 = "\n".join(output)
|
||||
if stripped_string_compare(output_1, ground_truth):
|
||||
return True
|
||||
|
||||
if isinstance(output, list):
|
||||
output_2 = [o.lstrip().rstrip() for o in output]
|
||||
output_2 = "\n".join(output_2)
|
||||
if stripped_string_compare(output_2, ground_truth):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def stripped_string_compare(s1, s2):
|
||||
s1 = s1.lstrip().rstrip()
|
||||
s2 = s2.lstrip().rstrip()
|
||||
return s1 == s2
|
||||
|
||||
def call_method(method, inputs):
|
||||
|
||||
if isinstance(inputs, list):
|
||||
inputs = "\n".join(inputs)
|
||||
|
||||
inputs_line_iterator = iter(inputs.split("\n"))
|
||||
|
||||
# sys.setrecursionlimit(10000)
|
||||
|
||||
# @patch('builtins.input', side_effect=inputs.split("\n"))
|
||||
@patch('builtins.open', mock_open(read_data=inputs))
|
||||
@patch('sys.stdin', StringIO(inputs))
|
||||
@patch('sys.stdin.readline', lambda *args: next(inputs_line_iterator))
|
||||
@patch('sys.stdin.readlines', lambda *args: inputs.split("\n"))
|
||||
@patch('sys.stdin.read', lambda *args: inputs)
|
||||
# @patch('sys.stdout.write', print)
|
||||
def _inner_call_method(_method):
|
||||
try:
|
||||
return _method()
|
||||
except SystemExit as e:
|
||||
pass
|
||||
finally:
|
||||
pass
|
||||
return _inner_call_method(method)
|
||||
|
||||
def main(args):
|
||||
print(args)
|
||||
problem_list = sorted(get_valid_problems(args.source))
|
||||
print(f"number of problems = {len(problem_list)}")
|
||||
prob_index = args.number
|
||||
print(f"problem is {problem_list[prob_index]}")
|
||||
|
||||
# This checks it correctly loaded. remove this later
|
||||
assert prob_index < len(problem_list)
|
||||
|
||||
if args.data == "q" or args.data == "question":
|
||||
tmp = get_question(problem_list, prob_index)
|
||||
print("q", tmp)
|
||||
elif args.data in ["solutions", "sol", "s",]:
|
||||
tmp = get_solutions(problem_list, prob_index)
|
||||
print("sol", tmp)
|
||||
elif args.data == "starter":
|
||||
tmp = get_starter(problem_list, prob_index)
|
||||
print("starter", tmp)
|
||||
elif args.data in ["test", "t"]:
|
||||
# test it with sols
|
||||
sols = get_solutions(problem_list, prob_index)
|
||||
tmp = run_test(problem_list, prob_index, test=sols[0])
|
||||
|
||||
print("results = ", tmp)
|
||||
print("-2 = compile error, -1 is runtime error, False failed test, True passed test")
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_args()
|
||||
main(args)
|
4
scripts/evaluation/code_search_net.py
Normal file
@@ -0,0 +1,4 @@
from datasets import load_dataset

dataset = load_dataset("code_x_glue_ct_code_to_text", "go")
print(dataset)
23
scripts/evaluation/concode.py
Normal file
@@ -0,0 +1,23 @@
import pandas as pd

from datasets import load_dataset, load_metric
from fastcore.script import *
from pathlib import Path

bleu = load_metric("sacrebleu")

predictions = ["hello there kenobi", "foo bar foobar"]
references = [
    ["hello there general kenobi"],
    ["foo bar foobar"],  # , "hello there !"], # , "foo bar foobar"],
]


@call_parse
def main(concode_path: Param("Path to the concode data in CodeXGLUE", str)):
    concode_path = Path(concode_path)
    dataset = load_dataset("json", data_files=str(concode_path / "test.json"))
    print(dataset)
    results = bleu.compute(predictions=predictions, references=references)
    print(list(results.keys()))
    print(round(results["score"], 1))
165
scripts/evaluation/evaluate.py
Normal file
@@ -0,0 +1,165 @@
|
||||
import json
|
||||
import torch
|
||||
import pandas as pd
|
||||
|
||||
# import apps.eval.reident
|
||||
|
||||
from apps_utils.generate_gpt_codes import generate_prompt
|
||||
from apps_utils.test_one_solution import eval_and_save_problems
|
||||
from datasets import load_dataset, load_metric
|
||||
from fastcore.script import *
|
||||
from human_eval.data import write_jsonl, read_problems
|
||||
from pathlib import Path
|
||||
from metrics.extrinsic_eval import compute_metrics
|
||||
from subprocess import check_output
|
||||
from transformers import AutoTokenizer, AutoModelWithLMHead
|
||||
|
||||
bleu = load_metric("sacrebleu")
|
||||
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125M")
|
||||
model = AutoModelWithLMHead.from_pretrained(
|
||||
"/home/nathan/gpt-code-clippy/data/APPS/models/1.5B"
|
||||
)
|
||||
|
||||
|
||||
def generate_text(prompt):
|
||||
# print(prompt)
|
||||
input_ids = torch.LongTensor(tokenizer.encode(prompt, verbose=False)).unsqueeze(
|
||||
0
|
||||
) # .cuda()
|
||||
output_ids = model.generate(
|
||||
input_ids,
|
||||
num_beams=2,
|
||||
early_stopping=True,
|
||||
max_length=1024 - len(input_ids),
|
||||
)
|
||||
output_str = tokenizer.decode(output_ids[0])
|
||||
return output_str
|
||||
# # "a", "=", "b", "\n", "y", "=", "a", "+", "1"
|
||||
# return "a = b \n y = a + 1"
|
||||
|
||||
|
||||
def _eval_concode(path):
|
||||
# TODO: format input to model same as App and OpenAI HumanEval datasets are formatted
|
||||
data = load_dataset("json", data_files=str(path / "test.json"))["train"]
|
||||
predictions = [[]]
|
||||
references = []
|
||||
for example in data:
|
||||
output = generate_text(example["nl"])
|
||||
predictions[0].append(output.split(" "))
|
||||
references.append(example["code"].split(" "))
|
||||
results = compute_metrics(predictions, references)
|
||||
print(f"Bleu score for Concode dataset: {results}")
|
||||
|
||||
|
||||
def _eval_apps(path):
|
||||
gpt_codes = {}
|
||||
prob_paths = sorted(path.glob("*/"))
|
||||
# map prob_paths to strings and save as a json file
|
||||
str_paths = [str(p) for p in prob_paths]
|
||||
with open(path / "test.json", "w") as f:
|
||||
json.dump(str_paths, f)
|
||||
for index, prob_path in enumerate(prob_paths[:2]):
|
||||
test_case_path = prob_path / "input_output.json"
|
||||
prompt_path = prob_path / "question.txt"
|
||||
starter_path = prob_path / "starter_code.py"
|
||||
solutions_path = prob_path / "solutions.json"
|
||||
if not starter_path.exists():
|
||||
starter_path = None
|
||||
if not test_case_path.exists() or not prompt_path.exists():
|
||||
continue
|
||||
prompt = generate_prompt(
|
||||
test_case_path,
|
||||
prompt_path,
|
||||
solutions_path,
|
||||
tokenizer,
|
||||
starter_path=starter_path,
|
||||
)
|
||||
output = generate_text(prompt)
|
||||
print(output)
|
||||
# print(output)
|
||||
gpt_codes[index] = output
|
||||
# print(output)
|
||||
|
||||
with open(path.parent / "all_codes.json", "w") as f:
|
||||
json.dump(gpt_codes, f)
|
||||
|
||||
eval_and_save_problems(path, path.parent)
|
||||
|
||||
# execute bash command to run eval script
|
||||
# results = check_output(
|
||||
# [
|
||||
# # python3 test_one_solution.py -t /path/to/apps/test --save /path/to/save_dir --print_results
|
||||
# "python",
|
||||
# "./apps_utils/test_one_solution.py",
|
||||
# "-t",
|
||||
# str(path),
|
||||
# "--save",
|
||||
# str(path.parent),
|
||||
# "--print_results",
|
||||
# ]
|
||||
# ).decode("utf-8")
|
||||
|
||||
|
||||
# test_case_path = os.path.join(prob_path, "input_output.json")
|
||||
# prompt_path = os.path.join(prob_path, "question.txt")
|
||||
# starter_path = os.path.join(prob_path, "starter_code.py")
|
||||
# solutions_path = os.path.join(prob_path, "solutions.json")
|
||||
# generate_prompt(args, test_case_path, prompt_path, solutions_path, tokenizer, starter_path=None)
|
||||
|
||||
|
||||
def _eval_human_eval(path):
|
||||
problems = read_problems()
|
||||
num_samples_per_task = 1
|
||||
samples = [
|
||||
dict(
|
||||
task_id=task_id,
|
||||
completion=generate_text(problems[task_id]["prompt"]),
|
||||
)
|
||||
for task_id in problems
|
||||
for _ in range(num_samples_per_task)
|
||||
]
|
||||
write_jsonl("human_eval.jsonl", samples)
|
||||
# execute bash command to run eval script
|
||||
results = check_output(
|
||||
[
|
||||
"python",
|
||||
path / "evaluate_functional_correctness.py",
|
||||
"human_eval.jsonl",
|
||||
]
|
||||
).decode("utf-8")
|
||||
|
||||
print(results)
|
||||
|
||||
|
||||
@call_parse
|
||||
def main(
|
||||
concode_path: Param("Path to the concode data in CodeXGLUE", str),
|
||||
apps_path: Param("Path to the APPS dataset", str),
|
||||
human_eval_path: Param("Path to the human eval dataset", str),
|
||||
):
|
||||
concode_path = Path(concode_path)
|
||||
apps_path = Path(apps_path)
|
||||
human_eval_path = Path(human_eval_path)
|
||||
# _eval_concode(concode_path)
|
||||
# _eval_human_eval(human_eval_path)
|
||||
_eval_apps(apps_path)
|
||||
# dataset = load_dataset("json", data_files=str(concode_path / "test.json"))
|
||||
# print(dataset)
|
||||
# results = bleu.compute(predictions=predictions, references=references)
|
||||
# print(list(results.keys()))
|
||||
# print(round(results["score"], 1))
|
||||
|
||||
|
||||
# problems = read_problems()
|
||||
# print(problems)
|
||||
# num_samples_per_task = 200
|
||||
# samples = [
|
||||
# dict(
|
||||
# task_id=task_id,
|
||||
# completion=generate_text(problems[task_id]["prompt"]),
|
||||
# )
|
||||
# for task_id in problems[:1]
|
||||
# for _ in range(num_samples_per_task)
|
||||
# ]
|
||||
# write_jsonl("human_eval.jsonl", samples)
|
164
scripts/evaluation/human_eval.jsonl
Normal file
@@ -0,0 +1,164 @@
|
||||
{"task_id": "HumanEval/0", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/1", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/2", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/3", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/4", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/5", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/6", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/7", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/8", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/9", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/10", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/11", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/12", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/13", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/14", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/15", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/16", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/17", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/18", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/19", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/20", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/21", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/22", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/23", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/24", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/25", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/26", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/27", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/28", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/29", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/30", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/31", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/32", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/33", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/34", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/35", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/36", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/37", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/38", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/39", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/40", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/41", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/42", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/43", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/44", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/45", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/46", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/47", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/48", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/49", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/50", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/51", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/52", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/53", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/54", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/55", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/56", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/57", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/58", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/59", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/60", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/61", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/62", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/63", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/64", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/65", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/66", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/67", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/68", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/69", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/70", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/71", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/72", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/73", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/74", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/75", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/76", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/77", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/78", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/79", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/80", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/81", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/82", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/83", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/84", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/85", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/86", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/87", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/88", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/89", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/90", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/91", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/92", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/93", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/94", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/95", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/96", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/97", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/98", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/99", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/100", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/101", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/102", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/103", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/104", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/105", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/106", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/107", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/108", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/109", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/110", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/111", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/112", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/113", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/114", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/115", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/116", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/117", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/118", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/119", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/120", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/121", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/122", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/123", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/124", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/125", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/126", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/127", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/128", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/129", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/130", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/131", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/132", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/133", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/134", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/135", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/136", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/137", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/138", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/139", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/140", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/141", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/142", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/143", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/144", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/145", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/146", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/147", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/148", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/149", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/150", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/151", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/152", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/153", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/154", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/155", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/156", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/157", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/158", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/159", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/160", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/161", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/162", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/163", "completion": "a = b \n y = a + 1"}
|
164
scripts/evaluation/human_eval.jsonl_results.jsonl
Normal file
@ -0,0 +1,164 @@
|
||||
{"task_id": "HumanEval/0", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
|
||||
{"task_id": "HumanEval/1", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
|
||||
{"task_id": "HumanEval/2", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
|
||||
{"task_id": "HumanEval/3", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
|
||||
{"task_id": "HumanEval/4", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
|
||||
{"task_id": "HumanEval/5", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
|
||||
{"task_id": "HumanEval/6", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
|
||||
{"task_id": "HumanEval/7", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
|
||||
{"task_id": "HumanEval/8", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
|
||||
{"task_id": "HumanEval/9", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
|
||||
{"task_id": "HumanEval/10", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 21)", "passed": false}
|
||||
{"task_id": "HumanEval/11", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
|
||||
{"task_id": "HumanEval/12", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
|
||||
{"task_id": "HumanEval/13", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
|
||||
{"task_id": "HumanEval/14", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 10)", "passed": false}
|
||||
{"task_id": "HumanEval/15", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
|
||||
{"task_id": "HumanEval/16", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
|
||||
{"task_id": "HumanEval/17", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 18)", "passed": false}
|
||||
{"task_id": "HumanEval/18", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
|
||||
{"task_id": "HumanEval/19", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
|
||||
{"task_id": "HumanEval/20", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
|
||||
{"task_id": "HumanEval/21", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
|
||||
{"task_id": "HumanEval/22", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
|
||||
{"task_id": "HumanEval/23", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
|
||||
{"task_id": "HumanEval/24", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 9)", "passed": false}
|
||||
{"task_id": "HumanEval/25", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 16)", "passed": false}
|
||||
{"task_id": "HumanEval/26", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
|
||||
{"task_id": "HumanEval/27", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 9)", "passed": false}
|
||||
{"task_id": "HumanEval/28", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
|
||||
{"task_id": "HumanEval/29", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
|
||||
{"task_id": "HumanEval/30", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
|
||||
{"task_id": "HumanEval/31", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 21)", "passed": false}
|
||||
{"task_id": "HumanEval/32", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 25)", "passed": false}
|
||||
{"task_id": "HumanEval/33", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
|
||||
{"task_id": "HumanEval/34", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 9)", "passed": false}
|
||||
{"task_id": "HumanEval/35", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
|
||||
{"task_id": "HumanEval/36", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
|
||||
{"task_id": "HumanEval/37", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
|
||||
{"task_id": "HumanEval/38", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 19)", "passed": false}
|
||||
{"task_id": "HumanEval/39", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 18)", "passed": false}
|
||||
{"task_id": "HumanEval/40", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 21)", "passed": false}
|
||||
{"task_id": "HumanEval/41", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 16)", "passed": false}
|
||||
{"task_id": "HumanEval/42", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
|
||||
{"task_id": "HumanEval/43", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 20)", "passed": false}
|
||||
{"task_id": "HumanEval/44", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
|
||||
{"task_id": "HumanEval/45", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 9)", "passed": false}
|
||||
{"task_id": "HumanEval/46", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 19)", "passed": false}
|
||||
{"task_id": "HumanEval/47", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
|
||||
{"task_id": "HumanEval/48", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 16)", "passed": false}
|
||||
{"task_id": "HumanEval/49", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 17)", "passed": false}
|
||||
{"task_id": "HumanEval/50", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
|
||||
{"task_id": "HumanEval/51", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 20)", "passed": false}
|
||||
{"task_id": "HumanEval/52", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
|
||||
{"task_id": "HumanEval/53", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
|
||||
{"task_id": "HumanEval/54", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 20)", "passed": false}
|
||||
{"task_id": "HumanEval/55", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
|
||||
{"task_id": "HumanEval/56", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 17)", "passed": false}
|
||||
{"task_id": "HumanEval/57", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
|
||||
{"task_id": "HumanEval/58", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
|
||||
{"task_id": "HumanEval/59", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
|
||||
{"task_id": "HumanEval/60", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 17)", "passed": false}
|
||||
{"task_id": "HumanEval/61", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 17)", "passed": false}
|
||||
{"task_id": "HumanEval/62", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
|
||||
{"task_id": "HumanEval/63", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 18)", "passed": false}
|
||||
{"task_id": "HumanEval/64", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 19)", "passed": false}
|
||||
{"task_id": "HumanEval/65", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
|
||||
{"task_id": "HumanEval/66", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 16)", "passed": false}
|
||||
{"task_id": "HumanEval/67", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 16)", "passed": false}
|
||||
{"task_id": "HumanEval/68", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 37)", "passed": false}
|
||||
{"task_id": "HumanEval/69", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
|
||||
{"task_id": "HumanEval/70", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
|
||||
{"task_id": "HumanEval/71", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
|
||||
{"task_id": "HumanEval/72", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 21)", "passed": false}
|
||||
{"task_id": "HumanEval/73", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
|
||||
{"task_id": "HumanEval/74", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 17)", "passed": false}
|
||||
{"task_id": "HumanEval/75", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
|
||||
{"task_id": "HumanEval/76", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
|
||||
{"task_id": "HumanEval/77", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 16)", "passed": false}
|
||||
{"task_id": "HumanEval/78", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 21)", "passed": false}
|
||||
{"task_id": "HumanEval/79", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
|
||||
{"task_id": "HumanEval/80", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
|
||||
{"task_id": "HumanEval/81", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 28)", "passed": false}
|
||||
{"task_id": "HumanEval/82", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
|
||||
{"task_id": "HumanEval/83", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 8)", "passed": false}
|
||||
{"task_id": "HumanEval/84", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 17)", "passed": false}
|
||||
{"task_id": "HumanEval/85", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 10)", "passed": false}
|
||||
{"task_id": "HumanEval/86", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 16)", "passed": false}
|
||||
{"task_id": "HumanEval/87", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 23)", "passed": false}
|
||||
{"task_id": "HumanEval/88", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 18)", "passed": false}
|
||||
{"task_id": "HumanEval/89", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
|
||||
{"task_id": "HumanEval/90", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
|
||||
{"task_id": "HumanEval/91", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
|
||||
{"task_id": "HumanEval/92", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 21)", "passed": false}
|
||||
{"task_id": "HumanEval/93", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 17)", "passed": false}
|
||||
{"task_id": "HumanEval/94", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 16)", "passed": false}
|
||||
{"task_id": "HumanEval/95", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
|
||||
{"task_id": "HumanEval/96", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
|
||||
{"task_id": "HumanEval/97", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
|
||||
{"task_id": "HumanEval/98", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
|
||||
{"task_id": "HumanEval/99", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 21)", "passed": false}
|
||||
{"task_id": "HumanEval/100", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 17)", "passed": false}
|
||||
{"task_id": "HumanEval/101", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
|
||||
{"task_id": "HumanEval/102", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
|
||||
{"task_id": "HumanEval/103", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
|
||||
{"task_id": "HumanEval/104", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
|
||||
{"task_id": "HumanEval/105", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 25)", "passed": false}
|
||||
{"task_id": "HumanEval/106", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
|
||||
{"task_id": "HumanEval/107", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 26)", "passed": false}
|
||||
{"task_id": "HumanEval/108", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
|
||||
{"task_id": "HumanEval/109", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 30)", "passed": false}
|
||||
{"task_id": "HumanEval/110", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 16)", "passed": false}
|
||||
{"task_id": "HumanEval/111", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 16)", "passed": false}
|
||||
{"task_id": "HumanEval/112", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
|
||||
{"task_id": "HumanEval/113", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
|
||||
{"task_id": "HumanEval/114", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
|
||||
{"task_id": "HumanEval/115", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 38)", "passed": false}
|
||||
{"task_id": "HumanEval/116", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
|
||||
{"task_id": "HumanEval/117", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 16)", "passed": false}
|
||||
{"task_id": "HumanEval/118", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 18)", "passed": false}
|
||||
{"task_id": "HumanEval/119", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 18)", "passed": false}
|
||||
{"task_id": "HumanEval/120", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 28)", "passed": false}
|
||||
{"task_id": "HumanEval/121", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
|
||||
{"task_id": "HumanEval/122", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 17)", "passed": false}
|
||||
{"task_id": "HumanEval/123", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 20)", "passed": false}
|
||||
{"task_id": "HumanEval/124", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 23)", "passed": false}
|
||||
{"task_id": "HumanEval/125", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
|
||||
{"task_id": "HumanEval/126", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 19)", "passed": false}
|
||||
{"task_id": "HumanEval/127", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 23)", "passed": false}
|
||||
{"task_id": "HumanEval/128", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
|
||||
{"task_id": "HumanEval/129", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 33)", "passed": false}
|
||||
{"task_id": "HumanEval/130", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 20)", "passed": false}
|
||||
{"task_id": "HumanEval/131", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
|
||||
{"task_id": "HumanEval/132", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 16)", "passed": false}
|
||||
{"task_id": "HumanEval/133", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 17)", "passed": false}
|
||||
{"task_id": "HumanEval/134", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 16)", "passed": false}
|
||||
{"task_id": "HumanEval/135", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
|
||||
{"task_id": "HumanEval/136", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
|
||||
{"task_id": "HumanEval/137", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
|
||||
{"task_id": "HumanEval/138", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 10)", "passed": false}
|
||||
{"task_id": "HumanEval/139", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
|
||||
{"task_id": "HumanEval/140", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
|
||||
{"task_id": "HumanEval/141", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 17)", "passed": false}
|
||||
{"task_id": "HumanEval/142", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 16)", "passed": false}
|
||||
{"task_id": "HumanEval/143", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 23)", "passed": false}
|
||||
{"task_id": "HumanEval/144", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
|
||||
{"task_id": "HumanEval/145", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
|
||||
{"task_id": "HumanEval/146", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
|
||||
{"task_id": "HumanEval/147", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 17)", "passed": false}
|
||||
{"task_id": "HumanEval/148", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 19)", "passed": false}
|
||||
{"task_id": "HumanEval/149", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 18)", "passed": false}
|
||||
{"task_id": "HumanEval/150", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
|
||||
{"task_id": "HumanEval/151", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
|
||||
{"task_id": "HumanEval/152", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 18)", "passed": false}
|
||||
{"task_id": "HumanEval/153", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 20)", "passed": false}
|
||||
{"task_id": "HumanEval/154", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
|
||||
{"task_id": "HumanEval/155", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 10)", "passed": false}
|
||||
{"task_id": "HumanEval/156", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
|
||||
{"task_id": "HumanEval/157", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
|
||||
{"task_id": "HumanEval/158", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
|
||||
{"task_id": "HumanEval/159", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 32)", "passed": false}
|
||||
{"task_id": "HumanEval/160", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 28)", "passed": false}
|
||||
{"task_id": "HumanEval/161", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
|
||||
{"task_id": "HumanEval/162", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 10)", "passed": false}
|
||||
{"task_id": "HumanEval/163", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
|
0
scripts/evaluation/human_eval_bench.py
Normal file
133
scripts/evaluation/metrics/bleu.py
Normal file
@ -0,0 +1,133 @@
|
||||
# Copyright 2017 Google Inc. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
# The following code is taken from CodeXGlue Repository - https://github.com/microsoft/CodeXGLUE/blob/main/Code-Code/code-to-code-trans/evaluator/CodeBLEU/bleu.py
|
||||
|
||||
|
||||
"""Python implementation of BLEU and smooth-BLEU.
|
||||
|
||||
This module provides a Python implementation of BLEU and smooth-BLEU.
|
||||
Smooth BLEU is computed following the method outlined in the paper:
|
||||
Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic
|
||||
evaluation metrics for machine translation. COLING 2004.
|
||||
"""
|
||||
|
||||
import collections
|
||||
import math
|
||||
|
||||
|
||||
def _get_ngrams(segment, max_order):
|
||||
"""Extracts all n-grams upto a given maximum order from an input segment.
|
||||
|
||||
Args:
|
||||
segment: text segment from which n-grams will be extracted.
|
||||
max_order: maximum length in tokens of the n-grams returned by this
|
||||
method.
|
||||
|
||||
Returns:
|
||||
The Counter containing all n-grams up to max_order in segment
|
||||
with a count of how many times each n-gram occurred.
|
||||
"""
|
||||
ngram_counts = collections.Counter()
|
||||
for order in range(1, max_order + 1):
|
||||
for i in range(0, len(segment) - order + 1):
|
||||
ngram = tuple(segment[i : i + order])
|
||||
ngram_counts[ngram] += 1
|
||||
return ngram_counts
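# Illustrative example (not part of the upstream file), assuming a toy segment:
# _get_ngrams(["a", "=", "b"], 2) returns roughly
# Counter({('a',): 1, ('=',): 1, ('b',): 1, ('a', '='): 1, ('=', 'b'): 1})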
|
||||
|
||||
|
||||
def compute_bleu(reference_corpus, translation_corpus, max_order=4, smooth=True):
|
||||
"""Computes BLEU score of translated segments against one or more references.
|
||||
|
||||
Args:
|
||||
reference_corpus: list of lists of references for each translation. Each
|
||||
reference should be tokenized into a list of tokens.
|
||||
translation_corpus: list of translations to score. Each translation
|
||||
should be tokenized into a list of tokens.
|
||||
max_order: Maximum n-gram order to use when computing BLEU score.
|
||||
smooth: Whether or not to apply Lin et al. 2004 smoothing.
|
||||
|
||||
Returns:
|
||||
3-Tuple with the BLEU score, n-gram precisions, geometric mean of n-gram
|
||||
precisions and brevity penalty.
|
||||
"""
|
||||
matches_by_order = [0] * max_order
|
||||
possible_matches_by_order = [0] * max_order
|
||||
reference_length = 0
|
||||
translation_length = 0
|
||||
for (references, translation) in zip(reference_corpus, translation_corpus):
|
||||
reference_length += min(len(r) for r in references)
|
||||
translation_length += len(translation)
|
||||
|
||||
merged_ref_ngram_counts = collections.Counter()
|
||||
for reference in references:
|
||||
merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
|
||||
translation_ngram_counts = _get_ngrams(translation, max_order)
|
||||
overlap = translation_ngram_counts & merged_ref_ngram_counts
|
||||
for ngram in overlap:
|
||||
matches_by_order[len(ngram) - 1] += overlap[ngram]
|
||||
for order in range(1, max_order + 1):
|
||||
possible_matches = len(translation) - order + 1
|
||||
if possible_matches > 0:
|
||||
possible_matches_by_order[order - 1] += possible_matches
|
||||
|
||||
precisions = [0] * max_order
|
||||
for i in range(0, max_order):
|
||||
if smooth:
|
||||
precisions[i] = (matches_by_order[i] + 1.0) / (
|
||||
possible_matches_by_order[i] + 1.0
|
||||
)
|
||||
else:
|
||||
if possible_matches_by_order[i] > 0:
|
||||
precisions[i] = (
|
||||
float(matches_by_order[i]) / possible_matches_by_order[i]
|
||||
)
|
||||
else:
|
||||
precisions[i] = 0.0
|
||||
|
||||
if min(precisions) > 0:
|
||||
p_log_sum = sum((1.0 / max_order) * math.log(p) for p in precisions)
|
||||
geo_mean = math.exp(p_log_sum)
|
||||
else:
|
||||
geo_mean = 0
|
||||
|
||||
ratio = float(translation_length) / reference_length
|
||||
|
||||
if ratio > 1.0:
|
||||
bp = 1.0
|
||||
else:
|
||||
bp = math.exp(1 - 1.0 / ratio)
|
||||
bleu = geo_mean * bp
|
||||
bleu_score_dict = {
|
||||
"bleu": bleu,
|
||||
"precision": precisions,
|
||||
"bp": bp,
|
||||
"ratio": ratio,
|
||||
"trans_len": translation_length,
|
||||
"ref_len": reference_length,
|
||||
}
|
||||
return bleu_score_dict # (bleu, precisions, bp, ratio, translation_length, reference_length)
|
||||
|
||||
|
||||
def bleu_test_case():
|
||||
"""A simple functionality test case to evaluate BLEU"""
|
||||
generated = [[["a", "=", "b", "\n", "y", "=", "a", "+", "1"]]]
|
||||
reference = [["a", "=", "b", "\n", "print", "a"]]
|
||||
score_dict = compute_bleu(generated, reference, smooth=False)
|
||||
return score_dict
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
score_dict = bleu_test_case()
|
||||
print(score_dict)
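# Minimal usage sketch (assumed tokenisation, not part of the upstream file).
# Note that the reference corpus is nested one level deeper than the candidate
# corpus, because every candidate may have several references:
# refs = [[["return", "a", "+", "b"]]]   # 1 sample with 1 reference
# cands = [["return", "a", "+", "b"]]    # 1 generated candidate
# print(compute_bleu(refs, cands, smooth=True)["bleu"])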
|
46
scripts/evaluation/metrics/extrinsic_eval.py
Normal file
@ -0,0 +1,46 @@
|
||||
from metrics.bleu import compute_bleu
|
||||
|
||||
|
||||
def compute_exact_match(references, generated) -> float:
|
||||
"""
|
||||
Computes Exact Match Accuracy.
|
||||
args:
|
||||
reference: list of lists of references for each translation. Each
|
||||
reference should be tokenized into a list of tokens.
|
||||
translation: list of translations to score. Each translation
|
||||
should be tokenized into a list of tokens.
|
||||
returns:
|
||||
exact_match_accuracy : Float
|
||||
"""
|
||||
assert len(references[0]) == len(generated), "Number of samples in references and generated outputs must match."
|
||||
exact_match_count = 0.0
|
||||
for gen, ref in zip(generated, references[0]):
|
||||
if gen == ref:
|
||||
exact_match_count += 1
|
||||
exact_match_acc = exact_match_count / len(generated)
|
||||
return exact_match_acc
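# Illustrative sketch (assumed shapes): references[0] holds one reference per
# generated sample, so this toy call scores 0.5.
# refs = [[["a", "=", "1"], ["b", "=", "2"]]]
# gens = [["a", "=", "1"], ["b", "=", "3"]]
# compute_exact_match(refs, gens)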
|
||||
|
||||
|
||||
def compute_metrics(references, generated) -> dict:
|
||||
"""
|
||||
Calculates various metrics and returns a dict with the computed values.
|
||||
args:
|
||||
reference: list of lists of references for each translation. Each
|
||||
reference should be tokenized into a list of tokens.
|
||||
translation: list of translations to score. Each translation
|
||||
should be tokenized into a list of tokens.
|
||||
returns:
|
||||
A dictionary containing the computed metrics.
|
||||
"""
|
||||
metrics_dict = {
|
||||
"smoothed_bleu_4": None,
|
||||
"bleu_4": None,
|
||||
"exact_match_acc": None,
|
||||
}  # Update as new metrics are added.
|
||||
metrics_dict["smoothed_bleu_4"] = compute_bleu(references, generated, smooth=True)
|
||||
metrics_dict["bleu_4"] = compute_bleu(references, generated, smooth=False)
|
||||
metrics_dict["exact_match_acc"] = compute_exact_match(references, generated)
|
||||
return metrics_dict
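# Usage sketch (assumed single-reference shapes; tokens are illustrative):
# refs = [[["a", "=", "1"]]]
# gens = [["a", "=", "1"]]
# print(compute_metrics(refs, gens))  # keys: smoothed_bleu_4, bleu_4, exact_match_acc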
|
31
scripts/get_license_info.py
Normal file
@ -0,0 +1,31 @@
|
||||
import os
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from fastcore.script import *
|
||||
from ghapi.all import GhApi
|
||||
|
||||
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN")
|
||||
|
||||
|
||||
# Get the license name of a repo via the GitHub API
|
||||
def get_license_info(owner, repo):
|
||||
api = GhApi(owner=owner, repo=repo, token=GITHUB_TOKEN)
|
||||
license = api.licenses.get_for_repo(owner=owner, repo=repo)
|
||||
return license.license.name
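# Usage sketch (hypothetical repository; requires a valid GITHUB_TOKEN):
# get_license_info("openai", "human-eval")  # -> e.g. "MIT License"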
|
||||
|
||||
@call_parse
|
||||
def main(repos_path: Param("Path to the csv containing all of the repos", str)):
|
||||
"""
|
||||
Read the repos csv and record the license for each repository.
|
||||
"""
|
||||
repos_path = Path(repos_path)
|
||||
df = pd.read_csv(repos_path)
|
||||
|
||||
# Loop through repos and get their license
|
||||
licenses = []
|
||||
for _, row in df.iterrows():
|
||||
owner, repo = row["name"].split("/")
|
||||
licenses.append(get_license_info(owner, repo))
|
||||
df["license"] = licenses
|
||||
df.to_csv(repos_path.parent/f"{repos_path.stem}_with_license.csv", index=False)
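# Assumed invocation (call_parse exposes the annotated parameter as a
# positional CLI argument):
#   python scripts/get_license_info.py repos.csv
# which writes repos_with_license.csv next to the input file.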
|