mirror of https://github.com/CodedotAl/gpt-code-clippy.git
synced 2024-08-16 10:20:28 +03:00
Merge branch 'evaluation' of https://github.com/ncoop57/gpt-code-clippy into evaluation
adding codebleu evaluation
commit 39ded45239
9
.gitmodules
vendored
@@ -1,3 +1,12 @@
[submodule "dependency_repos/github-downloader"]
	path = dependency_repos/github-downloader
	url = https://github.com/EleutherAI/github-downloader
[submodule "dependency_repos/apps"]
	path = dependency_repos/apps
	url = https://github.com/hendrycks/apps.git
[submodule "dependency_repos/human-eval"]
	path = dependency_repos/human-eval
	url = https://github.com/openai/human-eval
[submodule "dependency_repos/CodeXGLUE"]
	path = dependency_repos/CodeXGLUE
	url = https://github.com/microsoft/CodeXGLUE
1
dependency_repos/CodeXGLUE
Submodule
@@ -0,0 +1 @@
Subproject commit 3e7bfe6dc4a88534c7803ce1bd8d1733c1d16888
1
dependency_repos/apps
Submodule
@@ -0,0 +1 @@
Subproject commit f834ca7d7405935376aabb5830edd0c42635824e
1
dependency_repos/human-eval
Submodule
@@ -0,0 +1 @@
Subproject commit 463c980b59e818ace59f6f9803cd92c749ceae61
BIN
metrics/.DS_Store
vendored
Normal file
Binary file not shown.
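Since this commit registers the four dependency repositories above as git submodules, a fresh checkout needs them initialized before the evaluation scripts can import from them. A minimal sketch, assuming the command is run from the repository root with git available on PATH:

# Minimal sketch: fetch the dependency_repos submodules added in this commit.
# Assumes this runs from the repository root and that git is on PATH.
import subprocess

subprocess.run(["git", "submodule", "update", "--init", "--recursive"], check=True)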
162
metrics/bleu.py
@@ -12,7 +12,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
# Took the following from CodeXGlue Repository - https://github.com/microsoft/CodeXGLUE/blob/main/Code-Code/code-to-code-trans/evaluator/CodeBLEU/bleu.py
|
||||
# The following code is taken from CodeXGlue Repository - https://github.com/microsoft/CodeXGLUE/blob/main/Code-Code/code-to-code-trans/evaluator/CodeBLEU/bleu.py
|
||||
|
||||
|
||||
"""Python implementation of BLEU and smooth-BLEU.
|
||||
@@ -28,98 +28,106 @@ import math
|
||||
|
||||
|
||||
def _get_ngrams(segment, max_order):
|
||||
"""Extracts all n-grams upto a given maximum order from an input segment.
|
||||
"""Extracts all n-grams upto a given maximum order from an input segment.
|
||||
|
||||
Args:
|
||||
segment: text segment from which n-grams will be extracted.
|
||||
max_order: maximum length in tokens of the n-grams returned by this
|
||||
methods.
|
||||
Args:
|
||||
segment: text segment from which n-grams will be extracted.
|
||||
max_order: maximum length in tokens of the n-grams returned by this
|
||||
methods.
|
||||
|
||||
Returns:
|
||||
The Counter containing all n-grams upto max_order in segment
|
||||
with a count of how many times each n-gram occurred.
|
||||
"""
|
||||
ngram_counts = collections.Counter()
|
||||
for order in range(1, max_order + 1):
|
||||
for i in range(0, len(segment) - order + 1):
|
||||
ngram = tuple(segment[i:i+order])
|
||||
ngram_counts[ngram] += 1
|
||||
return ngram_counts
|
||||
Returns:
|
||||
The Counter containing all n-grams upto max_order in segment
|
||||
with a count of how many times each n-gram occurred.
|
||||
"""
|
||||
ngram_counts = collections.Counter()
|
||||
for order in range(1, max_order + 1):
|
||||
for i in range(0, len(segment) - order + 1):
|
||||
ngram = tuple(segment[i : i + order])
|
||||
ngram_counts[ngram] += 1
|
||||
return ngram_counts
|
||||
|
||||
|
||||
def compute_bleu(reference_corpus, translation_corpus, max_order=4,
|
||||
smooth=True):
|
||||
"""Computes BLEU score of translated segments against one or more references.
|
||||
def compute_bleu(reference_corpus, translation_corpus, max_order=4, smooth=True):
|
||||
"""Computes BLEU score of translated segments against one or more references.
|
||||
|
||||
Args:
|
||||
reference_corpus: list of lists of references for each translation. Each
|
||||
reference should be tokenized into a list of tokens.
|
||||
translation_corpus: list of translations to score. Each translation
|
||||
should be tokenized into a list of tokens.
|
||||
max_order: Maximum n-gram order to use when computing BLEU score.
|
||||
smooth: Whether or not to apply Lin et al. 2004 smoothing.
|
||||
Args:
|
||||
reference_corpus: list of lists of references for each translation. Each
|
||||
reference should be tokenized into a list of tokens.
|
||||
translation_corpus: list of translations to score. Each translation
|
||||
should be tokenized into a list of tokens.
|
||||
max_order: Maximum n-gram order to use when computing BLEU score.
|
||||
smooth: Whether or not to apply Lin et al. 2004 smoothing.
|
||||
|
||||
Returns:
|
||||
3-Tuple with the BLEU score, n-gram precisions, geometric mean of n-gram
|
||||
precisions and brevity penalty.
|
||||
"""
|
||||
matches_by_order = [0] * max_order
|
||||
possible_matches_by_order = [0] * max_order
|
||||
reference_length = 0
|
||||
translation_length = 0
|
||||
for (references, translation) in zip(reference_corpus,
|
||||
translation_corpus):
|
||||
reference_length += min(len(r) for r in references)
|
||||
translation_length += len(translation)
|
||||
Returns:
|
||||
3-Tuple with the BLEU score, n-gram precisions, geometric mean of n-gram
|
||||
precisions and brevity penalty.
|
||||
"""
|
||||
matches_by_order = [0] * max_order
|
||||
possible_matches_by_order = [0] * max_order
|
||||
reference_length = 0
|
||||
translation_length = 0
|
||||
for (references, translation) in zip(reference_corpus, translation_corpus):
|
||||
reference_length += min(len(r) for r in references)
|
||||
translation_length += len(translation)
|
||||
|
||||
merged_ref_ngram_counts = collections.Counter()
|
||||
for reference in references:
|
||||
merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
|
||||
translation_ngram_counts = _get_ngrams(translation, max_order)
|
||||
overlap = translation_ngram_counts & merged_ref_ngram_counts
|
||||
for ngram in overlap:
|
||||
matches_by_order[len(ngram)-1] += overlap[ngram]
|
||||
for order in range(1, max_order+1):
|
||||
possible_matches = len(translation) - order + 1
|
||||
if possible_matches > 0:
|
||||
possible_matches_by_order[order-1] += possible_matches
|
||||
merged_ref_ngram_counts = collections.Counter()
|
||||
for reference in references:
|
||||
merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
|
||||
translation_ngram_counts = _get_ngrams(translation, max_order)
|
||||
overlap = translation_ngram_counts & merged_ref_ngram_counts
|
||||
for ngram in overlap:
|
||||
matches_by_order[len(ngram) - 1] += overlap[ngram]
|
||||
for order in range(1, max_order + 1):
|
||||
possible_matches = len(translation) - order + 1
|
||||
if possible_matches > 0:
|
||||
possible_matches_by_order[order - 1] += possible_matches
|
||||
|
||||
precisions = [0] * max_order
|
||||
for i in range(0, max_order):
|
||||
if smooth:
|
||||
precisions[i] = ((matches_by_order[i] + 1.) /
|
||||
(possible_matches_by_order[i] + 1.))
|
||||
precisions = [0] * max_order
|
||||
for i in range(0, max_order):
|
||||
if smooth:
|
||||
precisions[i] = (matches_by_order[i] + 1.0) / (
|
||||
possible_matches_by_order[i] + 1.0
|
||||
)
|
||||
else:
|
||||
if possible_matches_by_order[i] > 0:
|
||||
precisions[i] = (
|
||||
float(matches_by_order[i]) / possible_matches_by_order[i]
|
||||
)
|
||||
else:
|
||||
precisions[i] = 0.0
|
||||
|
||||
if min(precisions) > 0:
|
||||
p_log_sum = sum((1.0 / max_order) * math.log(p) for p in precisions)
|
||||
geo_mean = math.exp(p_log_sum)
|
||||
else:
|
||||
if possible_matches_by_order[i] > 0:
|
||||
precisions[i] = (float(matches_by_order[i]) /
|
||||
possible_matches_by_order[i])
|
||||
else:
|
||||
precisions[i] = 0.0
|
||||
geo_mean = 0
|
||||
|
||||
if min(precisions) > 0:
|
||||
p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
|
||||
geo_mean = math.exp(p_log_sum)
|
||||
else:
|
||||
geo_mean = 0
|
||||
ratio = float(translation_length) / reference_length
|
||||
|
||||
ratio = float(translation_length) / reference_length
|
||||
if ratio > 1.0:
|
||||
bp = 1.0
|
||||
else:
|
||||
bp = math.exp(1 - 1.0 / ratio)
|
||||
bleu = geo_mean * bp
|
||||
bleu_score_dict = {
|
||||
"bleu": bleu,
|
||||
"precision": precisions,
|
||||
"bp": bp,
|
||||
"ratio": ratio,
|
||||
"trans_len": translation_length,
|
||||
"ref_len": reference_length,
|
||||
}
|
||||
return bleu_score_dict # (bleu, precisions, bp, ratio, translation_length, reference_length)
|
||||
|
||||
if ratio > 1.0:
|
||||
bp = 1.
|
||||
else:
|
||||
bp = math.exp(1 - 1. / ratio)
|
||||
bleu = geo_mean * bp
|
||||
print(geo_mean)
|
||||
bleu_score_dict = {"bleu":bleu,"precision":precisions,"bp":bp,"ratio":ratio,"trans_len":translation_length,"ref_len":reference_length}
|
||||
return bleu_score_dict#(bleu, precisions, bp, ratio, translation_length, reference_length)
|
||||
|
||||
def bleu_test_case():
|
||||
"""A simple functionality test case to evaluate BLEU"""
|
||||
generated = [[["a","=","b","\n","y","=","a","+","1"]]]
|
||||
reference = [["a","=","b","\n","print","a"]]
|
||||
score_dict = compute_bleu(generated,reference,smooth=False)
|
||||
generated = [[["a", "=", "b", "\n", "y", "=", "a", "+", "1"]]]
|
||||
reference = [["a", "=", "b", "\n", "print", "a"]]
|
||||
score_dict = compute_bleu(generated, reference, smooth=False)
|
||||
return score_dict
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
score_dict = bleu_test_case()
|
||||
print(score_dict)
|
||||
print(score_dict)
|
||||
|
metrics/extrinsic_eval.py
@@ -1,6 +1,10 @@
from metrics.bleu import compute_bleu
from metrics.parse_check import check_parse

def compute_metrics(references,generated) -> dict:
Parser = check_parse()  # Initializing parser


def compute_metrics(references, generated, lang) -> dict:
    """
    Calculates various metrics and returns a dictionary of the calculated metrics.
    args:
@@ -8,11 +12,12 @@ def compute_metrics(references,generated) -> dict:
        reference should be tokenized into a list of tokens.
    translation: list of translations to score. Each translation
        should be tokenized into a list of tokens.
    lang(str) : The language the generated code belongs to
    returns:
        A dictionary with the different metrics.
    """
    metrics_dict = {}  #Update as in new metrics are added over here.
    metrics_dict["smoothed_bleu_4"] = compute_bleu(references,generated,smooth=True)
    metrics_dict["bleu_4"] = compute_bleu(references,generated,smooth=False)

    return metrics_dict
    metrics_dict = {}  # Update as in new metrics are added over here.
    metrics_dict["smoothed_bleu_4"] = compute_bleu(references, generated, smooth=True)
    metrics_dict["bleu_4"] = compute_bleu(references, generated, smooth=False)
    metrics_dict["parse_score"] = Parser(generated, lang)["parse_score"]
    return metrics_dict
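Note that compute_bleu consumes tokenized code while the newly wired-in check_parse (added below in metrics/parse_check.py) parses raw source strings, so a caller currently needs both representations. A hedged sketch of exercising both metrics, with illustrative sample code and assuming the tree-sitter grammar bundle has already been built:

# Hedged sketch: feeding each metric the input shape it expects.
# compute_bleu wants token lists; check_parse wants raw source strings.
from metrics.bleu import compute_bleu
from metrics.parse_check import check_parse

code_str = "x = 1\nprint(x)"   # illustrative generated program
ref_str = "x = 1\nprint(x)"    # illustrative reference program

bleu = compute_bleu([[ref_str.split()]], [code_str.split()], smooth=True)
parser = check_parse()          # loads the tree-sitter grammars from disk
parse = parser([code_str], "py")
print(bleu["bleu"], parse["parse_score"], parse["index_parse"])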
53
metrics/parse_check.py
Normal file
@@ -0,0 +1,53 @@
from tree_sitter import Language, Parser


def load_tree_sitter_languages():
    """Loads language grammars to evaluate."""
    py_parser = Parser()
    py_parser.set_language(Language('./tree_sitter_utils/build/my-languages.so', 'python'))
    js_parser = Parser()
    js_parser.set_language(Language('./tree_sitter_utils/build/my-languages.so', 'javascript'))
    cpp_parser = Parser()
    cpp_parser.set_language(Language('./tree_sitter_utils/build/my-languages.so', 'cpp'))
    go_parser = Parser()
    go_parser.set_language(Language('./tree_sitter_utils/build/my-languages.so', 'go'))
    java_parser = Parser()
    java_parser.set_language(Language('./tree_sitter_utils/build/my-languages.so', 'java'))
    return {
        "py": py_parser,
        "js": js_parser,
        "cpp": cpp_parser,
        "go": go_parser,
        "java": java_parser,
    }


class check_parse:
    def __init__(self):
        self.language_dict = load_tree_sitter_languages()

    def __call__(self, batch, lang):
        """
        args:
            batch : list[str] of code generated by the model
            lang : lang should be one of the above language_dict keys

        returns:
            dict(
                parse_score = averaged score over how many datapoints parse cleanly
                index_parse = per-index flag indicating whether that datapoint parsed
            )
        """
        cumulative_parse_score = 0
        index_parse_list = []
        parser = self.language_dict[lang]
        for inp in batch:
            parsed = parser.parse(bytes(inp, "utf-8"))
            # A generation counts as parsed if tree-sitter reports no ERROR node.
            inp_ind_score = int("ERROR" not in parsed.root_node.sexp())
            cumulative_parse_score += inp_ind_score
            index_parse_list.append(inp_ind_score)
        # Average over the batch so the returned score matches the docstring above.
        parse_score = cumulative_parse_score / len(batch) if batch else 0.0
        return {"parse_score": parse_score, "index_parse": index_parse_list}


if __name__ == "__main__":
    Parse = check_parse()
    score = Parse(["""
def a():
    if bar:
        baz()"""], "py")
    print(score)
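parse_check.py loads a prebuilt grammar bundle from ./tree_sitter_utils/build/my-languages.so, which is not part of this commit. A minimal sketch of how such a bundle is typically produced with py-tree-sitter's Language.build_library; the vendor/ checkout paths are assumptions, not paths from this repository:

# Hedged sketch: build the combined grammar bundle that parse_check.py loads.
# The tree-sitter-* grammar checkouts below are assumed locations; clone them
# (e.g. from github.com/tree-sitter) wherever convenient and adjust the paths.
from tree_sitter import Language

Language.build_library(
    "tree_sitter_utils/build/my-languages.so",
    [
        "vendor/tree-sitter-python",
        "vendor/tree-sitter-javascript",
        "vendor/tree-sitter-cpp",
        "vendor/tree-sitter-go",
        "vendor/tree-sitter-java",
    ],
)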
BIN
metrics/tree_sitter_utils/.DS_Store
vendored
Normal file
Binary file not shown.
File diff suppressed because one or more lines are too long
102
scripts/evaluation/apps_utils/generate_gpt_codes.py
Normal file
@@ -0,0 +1,102 @@
|
||||
# MIT License
|
||||
|
||||
# Copyright (c) 2021 Dan Hendrycks and contributors.
|
||||
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
|
||||
"""
|
||||
Run a trained model to generate Python code.
|
||||
"""
|
||||
|
||||
import io
|
||||
import json
|
||||
import logging
|
||||
import math
|
||||
import random
|
||||
import numpy as np
|
||||
import os
|
||||
import pprint
|
||||
import sys
|
||||
import time
|
||||
import transformers
|
||||
import torch
|
||||
|
||||
from apps_utils.reindent import run as run_reindent
|
||||
|
||||
# for timing and debugging
|
||||
from datetime import datetime, date
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
def reindent_code(codestr):
|
||||
"""
|
||||
Given code string, reindent it in the same way that the
|
||||
Github dataset was indented
|
||||
"""
|
||||
codestr = io.StringIO(codestr)
|
||||
ret = io.StringIO()
|
||||
|
||||
run_reindent(
|
||||
codestr,
|
||||
ret,
|
||||
config={
|
||||
"dry-run": False,
|
||||
"help": False,
|
||||
"to": 10,
|
||||
"from": -1,
|
||||
"tabs": True,
|
||||
"encoding": "utf-8",
|
||||
"is-tabs": False,
|
||||
"tabsize": 10,
|
||||
"all-tabs": False,
|
||||
},
|
||||
)
|
||||
|
||||
return ret.getvalue()
|
||||
|
||||
|
||||
def generate_prompt(
|
||||
test_case_path, prompt_path, solutions_path, tokenizer, starter_path=None
|
||||
):
|
||||
_input = "\nQUESTION:\n"
|
||||
with open(prompt_path, "r") as f:
|
||||
data = f.readlines()
|
||||
data = "".join(data)
|
||||
_input += data
|
||||
if starter_path != None:
|
||||
with open(starter_path, "r") as f:
|
||||
data = f.readlines()
|
||||
data = "".join(data)
|
||||
data = "\n" + data # + "\n"
|
||||
_input += data
|
||||
else:
|
||||
# _input += "\n\n"
|
||||
pass
|
||||
|
||||
with open(test_case_path, "r") as f:
|
||||
data = json.load(f)
|
||||
if not data.get("fn_name"):
|
||||
_input += "\nUse Standard Input format" # \n"
|
||||
else:
|
||||
_input += "\nUse Call-Based format" # \n"
|
||||
|
||||
_input += "\nANSWER:\n"
|
||||
|
||||
return _input
|
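A hedged sketch of driving generate_prompt above for a single APPS problem directory; the concrete problem path and the gpt-neo tokenizer are illustrative choices, and evaluate.py further down wires this up across the whole test split:

# Hedged sketch: build one APPS prompt with the helper defined above.
# The problem folder layout (question.txt, input_output.json, solutions.json,
# starter_code.py) follows the APPS dataset; the path itself is illustrative.
from pathlib import Path
from transformers import AutoTokenizer
from apps_utils.generate_gpt_codes import generate_prompt

prob = Path("dependency_repos/apps/APPS/test/0000")  # illustrative problem folder
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125M")
starter = prob / "starter_code.py"
prompt = generate_prompt(
    prob / "input_output.json",
    prob / "question.txt",
    prob / "solutions.json",
    tokenizer,
    starter_path=starter if starter.exists() else None,
)
print(prompt[:300])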
227
scripts/evaluation/apps_utils/reindent.py
Normal file
@@ -0,0 +1,227 @@
|
||||
# MIT License
|
||||
|
||||
# Copyright (c) 2021 Dan Hendrycks and contributors.
|
||||
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
|
||||
"""
|
||||
Reindent files.
|
||||
"""
|
||||
|
||||
from __future__ import print_function
|
||||
import sys
|
||||
import getopt
|
||||
import codecs
|
||||
import tempfile
|
||||
import shutil
|
||||
import os
|
||||
|
||||
|
||||
def _find_indentation(line, config):
|
||||
if len(line) and line[0] in (" ", "\t") and not line.isspace():
|
||||
if line[0] == "\t":
|
||||
config['is-tabs'] = True
|
||||
# Find indentation
|
||||
i = 0
|
||||
for char in list(line):
|
||||
if char not in (" ", "\t"):
|
||||
break
|
||||
i += 1
|
||||
config["from"] = i
|
||||
|
||||
|
||||
def find_indentation(line, config):
|
||||
# Find indentation level used in file
|
||||
if config['from'] < 0:
|
||||
_find_indentation(line, config)
|
||||
|
||||
if config['from'] >= 0:
|
||||
# Set old indent
|
||||
indent = " " if not config['is-tabs'] else "\t"
|
||||
indent = indent * config['from']
|
||||
|
||||
# Set new indent
|
||||
newindent = " " if not config['tabs'] else "\t"
|
||||
if not config['tabs']:
|
||||
newindent = newindent * config['to']
|
||||
|
||||
return indent, newindent
|
||||
|
||||
# Continue to the next line, indentation not found
|
||||
return False
|
||||
|
||||
|
||||
def replace_inline_tabs(content, config):
|
||||
newcontent = ""
|
||||
imagined_i = 0
|
||||
for i in range(0, len(content)):
|
||||
char = content[i]
|
||||
if char == '\t':
|
||||
spaces = config['tabsize']-(imagined_i % config['tabsize'])
|
||||
newcontent += " " * spaces
|
||||
imagined_i += spaces
|
||||
else:
|
||||
newcontent += char
|
||||
imagined_i += 1
|
||||
return newcontent
|
||||
|
||||
|
||||
def run(fd_in, fd_out, config):
|
||||
from reindent_4_spaces import Reindenter
|
||||
import io
|
||||
|
||||
inter = io.StringIO()
|
||||
ri = Reindenter(fd_in)
|
||||
ri.run()
|
||||
ri.write(inter)
|
||||
fd_in = inter
|
||||
fd_in.seek(0)
|
||||
|
||||
while True:
|
||||
line = fd_in.readline()
|
||||
if not line:
|
||||
break
|
||||
line = line.rstrip('\r\n')
|
||||
|
||||
# Find indentation style used in file if not set
|
||||
if config['from'] < 0:
|
||||
indent = find_indentation(line, config)
|
||||
if not indent:
|
||||
print(line, file=fd_out)
|
||||
continue
|
||||
indent, newindent = indent
|
||||
|
||||
# Find current indentation level
|
||||
level = 0
|
||||
while True:
|
||||
whitespace = line[:len(indent) * (level + 1)]
|
||||
if whitespace == indent * (level + 1):
|
||||
level += 1
|
||||
else:
|
||||
break
|
||||
|
||||
content = line[len(indent) * level:]
|
||||
if config['all-tabs']:
|
||||
content = replace_inline_tabs(content, config)
|
||||
|
||||
line = (newindent * level) + content
|
||||
print(line, file=fd_out)
|
||||
# print(config)
|
||||
|
||||
|
||||
def run_files(filenames, config):
|
||||
for filename in filenames:
|
||||
with codecs.open(filename, encoding=config['encoding']) as fd_in:
|
||||
if config['dry-run']:
|
||||
print("Filename: %s" % filename)
|
||||
fd_out = sys.stdout
|
||||
else:
|
||||
fd_out = tempfile.NamedTemporaryFile(mode='wb', delete=False)
|
||||
fd_out.close()
|
||||
fd_out = codecs.open(fd_out.name, "wb", encoding=config['encoding'])
|
||||
|
||||
run(fd_in, fd_out, config)
|
||||
|
||||
if not config["dry-run"]:
|
||||
fd_out.close()
|
||||
shutil.copy(fd_out.name, filename)
|
||||
os.remove(fd_out.name)
|
||||
|
||||
|
||||
def main(args):
|
||||
config = {
|
||||
"dry-run": False,
|
||||
"help": False,
|
||||
"to": 4,
|
||||
"from": -1,
|
||||
"tabs": False,
|
||||
"encoding": "utf-8",
|
||||
"is-tabs": False,
|
||||
"tabsize": 4,
|
||||
"all-tabs": False
|
||||
}
|
||||
possible_args = {
|
||||
"d": "dry-run",
|
||||
"h": "help",
|
||||
"t:": "to=",
|
||||
"f:": "from=",
|
||||
"n": "tabs",
|
||||
"e:": "encoding=",
|
||||
"s:": "tabsize=",
|
||||
"a": "all-tabs",
|
||||
}
|
||||
optlist, filenames = getopt.getopt(
|
||||
args[1:],
|
||||
"".join(possible_args.keys()),
|
||||
possible_args.values()
|
||||
)
|
||||
|
||||
shortargs, longargs = [], []
|
||||
for shortarg in possible_args:
|
||||
shortargs.append(shortarg.rstrip(":"))
|
||||
longargs.append(possible_args[shortarg].rstrip("="))
|
||||
|
||||
for opt, val in optlist:
|
||||
opt = opt.lstrip("-")
|
||||
if opt in shortargs:
|
||||
opt = longargs[shortargs.index(opt)]
|
||||
if isinstance(config[opt], bool):
|
||||
config[opt] = True
|
||||
elif isinstance(config[opt], int):
|
||||
config[opt] = int(val)
|
||||
else:
|
||||
config[opt] = val
|
||||
|
||||
if config['help']:
|
||||
help = """
|
||||
Usage: %s [options] filename(s)
|
||||
Options:
|
||||
-h, --help Show this message
|
||||
-d, --dry-run Don't save anything, just print
|
||||
the result
|
||||
-t <n>, --to <n> Convert to this number of spaces
|
||||
(default: 4)
|
||||
-f <n>, --from <n> Convert from this number of spaces
|
||||
(default: auto-detect, will also
|
||||
detect tabs)
|
||||
-n, --tabs Don't convert indentation to spaces,
|
||||
convert to tabs instead. -t and
|
||||
--to will have no effect.
|
||||
-a, --all-tabs Also convert tabs used for alignment
|
||||
in the code (Warning: will replace
|
||||
all tabs in the file, even if inside
|
||||
a string)
|
||||
-s <n>, --tabsize <n> Set how many spaces one tab is
|
||||
(only has an effect on -a, default: 4)
|
||||
-e <s>, --encoding <s> Open files with specified encoding
|
||||
(default: utf-8)
|
||||
""" % args[0]
|
||||
|
||||
# Also removes 8 leading spaces to remove our indentation
|
||||
print("\n".join([x[8:] for x in help[1:].split("\n")]))
|
||||
sys.exit(0)
|
||||
|
||||
if filenames:
|
||||
run_files(filenames, config)
|
||||
else:
|
||||
run(sys.stdin, sys.stdout, config)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main(sys.argv)
|
158
scripts/evaluation/apps_utils/test_one_solution.py
Normal file
@@ -0,0 +1,158 @@
|
||||
# MIT License
|
||||
|
||||
# Copyright (c) 2021 Dan Hendrycks and contributors.
|
||||
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
"""
|
||||
Run solutions from one problem.
|
||||
"""
|
||||
|
||||
import io
|
||||
import json
|
||||
import logging
|
||||
import math
|
||||
import numpy as np
|
||||
import os
|
||||
import pprint
|
||||
import sys
|
||||
import apps_utils.testing_util as test_util
|
||||
import time
|
||||
|
||||
# for timing debugging
|
||||
from datetime import datetime, date
|
||||
from pathlib import Path
|
||||
from tqdm import tqdm
|
||||
|
||||
from typing import List
|
||||
|
||||
|
||||
def print_results(results, args):
|
||||
res = []
|
||||
per_prob_res = []
|
||||
all_correct = []
|
||||
for index in results:
|
||||
res.extend(results[index])
|
||||
per_prob_res.append(np.mean(results[index]))
|
||||
all_correct.append(np.all(results[index]))
|
||||
tmp_results = res
|
||||
compile_errors = len(tmp_results[tmp_results == -2])
|
||||
runtime_errors = len(tmp_results[tmp_results == -1])
|
||||
failures = len(tmp_results[tmp_results == False])
|
||||
successes = len(tmp_results[tmp_results == True])
|
||||
total_testcases = len(res)
|
||||
if args.debug:
|
||||
print(
|
||||
f"number of compile errors = {compile_errors} avg = {compile_errors / total_testcases }"
|
||||
)
|
||||
print(
|
||||
f"number of runtime errors = {runtime_errors} avg = {runtime_errors / total_testcases}"
|
||||
)
|
||||
print(f"number of test cases run = {total_testcases}")
|
||||
|
||||
print(
|
||||
f"Test Case Average (average accuracy over problems) = {np.mean(per_prob_res)}"
|
||||
)
|
||||
print(
|
||||
f"Strict Accuracy (all test cases passed / total problems) = {np.mean(all_correct)}"
|
||||
)
|
||||
|
||||
|
||||
def eval_and_save_problems(test_loc, save):
|
||||
test_path = Path(test_loc)
|
||||
problems = list(test_path.glob("*/"))
|
||||
|
||||
print(len(problems))
|
||||
gpt_codes = {}
|
||||
gpt_bleu = {}
|
||||
gpt_codebleu = {}
|
||||
results = {}
|
||||
codes_loc = os.path.join(save, f"all_codes.json")
|
||||
# if not os.path.exists(codes_loc):
|
||||
# codes_loc = os.path.join(args.save, f"{args.start}-{args.end}_codes.json")
|
||||
|
||||
if os.path.exists(codes_loc):
|
||||
results_loc = os.path.join(save, f"all_results.json")
|
||||
print(codes_loc, results_loc)
|
||||
|
||||
with open(codes_loc, "r") as f:
|
||||
gpt_codes = json.load(f)
|
||||
|
||||
# main eval loop
|
||||
for index, problem in enumerate(tqdm(problems[:2])):
|
||||
try:
|
||||
# if args.debug:
|
||||
# print(f"\n\nproblem path = {problem}")
|
||||
output_str = gpt_codes[str(index)]
|
||||
except:
|
||||
print("CANNOT FIND OUTPUT_STR FOR", problem)
|
||||
continue
|
||||
prob_path = problem # os.path.join(args.root, problem)
|
||||
|
||||
# with open(os.path.join(prob_path, "solutions.json"), "r") as f:
|
||||
# sols = json.load(f)
|
||||
|
||||
if not os.path.exists(save):
|
||||
os.makedirs(save)
|
||||
|
||||
res = []
|
||||
# for o_idx, o in enumerate(output_str):
|
||||
# print(o)
|
||||
# if args.debug:
|
||||
# print(f"\nTesting solution {o_idx}")
|
||||
curr_res = [-2]
|
||||
try:
|
||||
curr_res = test_util.run_test(
|
||||
prob_path=prob_path, test=output_str, debug=False # args.debug
|
||||
)
|
||||
fixed = []
|
||||
for e in curr_res:
|
||||
if isinstance(e, np.ndarray):
|
||||
e = e.item(0)
|
||||
if isinstance(e, np.bool_):
|
||||
e = bool(e)
|
||||
fixed.append(e)
|
||||
curr_res = fixed
|
||||
if not np.all(curr_res):
|
||||
print(f"Results were not all True: {curr_res}")
|
||||
except Exception as e:
|
||||
print(f"test framework exception = {repr(e)}{e}\n")
|
||||
break
|
||||
finally:
|
||||
assert isinstance(curr_res, list)
|
||||
res.append(curr_res)
|
||||
|
||||
# if args.debug:
|
||||
# print(
|
||||
# f"\nHow to read results [-2] = compile error, [-1] = runtime error [False] = failed test case [True] = passed test case"
|
||||
# )
|
||||
# print(f"results = {res}")
|
||||
|
||||
results[index] = res
|
||||
|
||||
with open(results_loc, "w") as f:
|
||||
try:
|
||||
f.write(json.dumps(results))
|
||||
except Exception as e:
|
||||
import pdb
|
||||
|
||||
pdb.set_trace()
|
||||
print("didn't save problem due to {e}")
|
||||
|
||||
return results
|
544
scripts/evaluation/apps_utils/testing_util.py
Normal file
@@ -0,0 +1,544 @@
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import io
|
||||
import faulthandler
|
||||
|
||||
# used for debugging to time steps
|
||||
from datetime import datetime
|
||||
|
||||
# to run the solution files we're using a timing based approach
|
||||
import signal
|
||||
|
||||
import numpy as np
|
||||
# for capturing the stdout
|
||||
from io import StringIO
|
||||
from typing import get_type_hints
|
||||
from typing import List, Tuple
|
||||
# used for testing the code that reads from input
|
||||
from unittest.mock import patch, mock_open
|
||||
|
||||
from pyext import RuntimeModule
|
||||
|
||||
from enum import Enum
|
||||
class CODE_TYPE(Enum):
|
||||
call_based = 0
|
||||
standard_input = 1
|
||||
|
||||
# stuff for setting up signal timer
|
||||
class TimeoutException(Exception):
|
||||
pass
|
||||
def timeout_handler(signum, frame):
|
||||
print("alarm went off")
|
||||
#return
|
||||
raise TimeoutException
|
||||
signal.signal(signal.SIGALRM, timeout_handler)
|
||||
timeout = 4 # seconds
|
||||
|
||||
# used to capture stdout as a list
|
||||
# from https://stackoverflow.com/a/16571630/6416660
|
||||
# alternative use redirect_stdout() from contextlib
|
||||
class Capturing(list):
|
||||
def __enter__(self):
|
||||
self._stdout = sys.stdout
|
||||
sys.stdout = self._stringio = StringIO()
|
||||
# Make closing the StringIO a no-op
|
||||
self._stringio.close = lambda x: 1
|
||||
return self
|
||||
def __exit__(self, *args):
|
||||
self.extend(self._stringio.getvalue().splitlines())
|
||||
del self._stringio # free up some memory
|
||||
sys.stdout = self._stdout
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(description="Utility for testing code generation.")
|
||||
parser.add_argument("-v", "--verbosity-level", action="store", type=int,
|
||||
help="")
|
||||
parser.add_argument("-s", "--source", type=str, default="leetcode",
|
||||
choices=["leetcode", "atcoder", "codewars",],
|
||||
help="which data source to gather from.")
|
||||
parser.add_argument("-d", "--data", type=str, default="question",
|
||||
choices=["question", "q", "solutions", "sol", "s", "starter", "tests", "t"],
|
||||
help="which type of data to receive.")
|
||||
parser.add_argument("-n", "--number", type=int, default=0,
|
||||
help="which problem to query.")
|
||||
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
|
||||
|
||||
def get_valid_problems(data_dir="leetcode"):
|
||||
# these are unnecessary atm
|
||||
if data_dir == "leetcode":
|
||||
root = os.path.join(args.source, "data")
|
||||
elif data_dir == "atcoder":
|
||||
pass
|
||||
|
||||
root = os.path.join(data_dir, "data")
|
||||
if os.path.exists(os.path.join(data_dir, "valid_problems.json")):
|
||||
with open(os.path.join(data_dir, "valid_problems.json"), "r") as f:
|
||||
return json.load(f)
|
||||
|
||||
# after we compute it once let's save it and load that instead
|
||||
# TODO determine if might be better to reload each time
|
||||
tmp = os.listdir(root)
|
||||
valid_probs = []
|
||||
for folder in tmp:
|
||||
prob_path = os.path.join(root, folder)
|
||||
files = os.listdir(prob_path)
|
||||
#TODO add more validity checks
|
||||
if "input_output.json" in files or "sols.json" in files:
|
||||
valid_probs.append(prob_path)
|
||||
valid_probs = sorted(valid_probs)
|
||||
#with open(os.path.join(args.source,"valid_problems.json"), "w") as f:
|
||||
# json.dump(valid_probs, f)
|
||||
return valid_probs
|
||||
|
||||
|
||||
def get_question(problem_list, prob_index):
|
||||
root = problem_list[prob_index]
|
||||
#print("get q", root)
|
||||
if os.path.exists(os.path.join(root, "question.txt")):
|
||||
with open(os.path.join(root, "question.txt")) as f:
|
||||
question = f.readlines()
|
||||
else:
|
||||
print("question prompt not found")
|
||||
question = ""
|
||||
question = "".join(question)
|
||||
return question
|
||||
|
||||
|
||||
def get_solutions(problem_list, prob_index):
|
||||
root = problem_list[prob_index]
|
||||
if os.path.exists(os.path.join(root, "solutions.json")):
|
||||
with open(os.path.join(root, "solutions.json")) as f:
|
||||
sols = json.load(f)
|
||||
return sols
|
||||
|
||||
|
||||
def run_test(prob_path:str=None, problem_list:List[str]=None, prob_index:int=None,
|
||||
test:str=None, debug:bool=False):
|
||||
"""
|
||||
if test is not None it'll try to run the code.
|
||||
otherwise it'll just return an input and output pair.
|
||||
"""
|
||||
if prob_path is None and problem_list is None:
|
||||
print("please provide either prob_path or problem_list")
|
||||
exit()
|
||||
|
||||
if debug:
|
||||
print(f"start = {datetime.now().time()}")
|
||||
if prob_path is not None:
|
||||
root = prob_path
|
||||
elif problem_list is not None:
|
||||
root = problem_list[prob_index]
|
||||
|
||||
if os.path.exists(os.path.join(root, "input_output.json")):
|
||||
with open(os.path.join(root, "input_output.json")) as f:
|
||||
in_outs = json.load(f)
|
||||
if debug:
|
||||
print(f"test cases json = {in_outs['inputs']} {in_outs['outputs']}")
|
||||
|
||||
if in_outs.get("fn_name") is None:
|
||||
which_type = CODE_TYPE.standard_input # Standard input
|
||||
method_name = None
|
||||
else:
|
||||
which_type = CODE_TYPE.call_based # Call-based
|
||||
method_name = in_outs["fn_name"]
|
||||
if debug:
|
||||
print(f"loaded json = {datetime.now().time()}")
|
||||
|
||||
#else:
|
||||
# continue
|
||||
if test is None:
|
||||
return in_outs
|
||||
elif test is not None:
|
||||
results = []
|
||||
sol = "import sys\nimport time\nimport itertools\nfrom itertools import accumulate, product, permutations, combinations\nimport collections\nfrom collections import Counter, OrderedDict, deque, defaultdict, ChainMap\nfrom functools import lru_cache\nimport math\nfrom math import sqrt, sin, cos, tan, ceil, fabs, floor, gcd, exp, log, log2\nimport fractions\nfrom typing import List, Tuple\nimport numpy as np\nimport random\nimport heapq\nfrom heapq import *\n"
|
||||
if debug:
|
||||
print(f"loading test code = {datetime.now().time()}")
|
||||
|
||||
if which_type == CODE_TYPE.call_based:
|
||||
sol += test
|
||||
if debug: # or True:
|
||||
print(f"sol = {sol}")
|
||||
signal.alarm(timeout)
|
||||
try:
|
||||
tmp_sol = RuntimeModule.from_string("tmp_sol", "", sol)
|
||||
if "class Solution" not in test:
|
||||
tmp = tmp_sol
|
||||
else:
|
||||
tmp = tmp_sol.Solution()
|
||||
signal.alarm(0)
|
||||
except Exception as e:
|
||||
signal.alarm(0)
|
||||
print(f"type 0 compilation error = {e}")
|
||||
results.append(-2)
|
||||
return results
|
||||
signal.alarm(0)
|
||||
|
||||
elif which_type == CODE_TYPE.standard_input:
|
||||
# sol
|
||||
tmp_test = test.split("\n")
|
||||
|
||||
new_test = []
|
||||
for x in tmp_test:
|
||||
if (not x.startswith("from ")) and (not x.startswith("import ")):
|
||||
new_test.append("\t" + x + "\n")
|
||||
else:
|
||||
new_test.append(x + "\n")
|
||||
tmp_test = new_test
|
||||
|
||||
new_test = ""
|
||||
started = False
|
||||
for i in tmp_test:
|
||||
if i.startswith("\t") and not started:
|
||||
new_test += "stdin = sys.stdin\nstdout = sys.stdout\n"
|
||||
new_test += "def code():\n"
|
||||
new_test += i
|
||||
started = True
|
||||
elif started and ((i.startswith("from ")) or (i.startswith("import "))):
|
||||
new_test += "\t" + i
|
||||
else:
|
||||
new_test += i
|
||||
tmp_test = new_test
|
||||
|
||||
sol += tmp_test
|
||||
if debug:
|
||||
print(f"sol = {sol}")
|
||||
# print(f"{o}")
|
||||
method_name = "code"
|
||||
signal.alarm(timeout)
|
||||
try:
|
||||
tmp_sol = RuntimeModule.from_string("tmp_sol", "", sol)
|
||||
tmp = tmp_sol
|
||||
signal.alarm(0)
|
||||
except Exception as e:
|
||||
signal.alarm(0)
|
||||
print(f"type 1 compilation error = {e}")
|
||||
results.append(-2)
|
||||
return results
|
||||
signal.alarm(0)
|
||||
if debug:
|
||||
print(f"get method = {datetime.now().time()}")
|
||||
|
||||
try:
|
||||
method = getattr(tmp, method_name) # get_attr second arg must be str
|
||||
except:
|
||||
signal.alarm(0)
|
||||
e = sys.exc_info()
|
||||
print(f"unable to get function error = {e}")
|
||||
return results
|
||||
|
||||
for index, inputs in enumerate(in_outs["inputs"]):
|
||||
# JSON forces dictionaries to have string keys; this undoes this (assuming a singleton list)
|
||||
try:
|
||||
if isinstance(inputs[0], dict):
|
||||
inputs = [{int(k): v for k,v in inputs[0].items()}]
|
||||
except:
|
||||
True
|
||||
try:
|
||||
if isinstance(in_outs["outputs"][index], dict):
|
||||
in_outs["outputs"][index] = [{int(k): v for k,v in in_outs["outputs"][index].items()}]
|
||||
except:
|
||||
True
|
||||
try:
|
||||
if isinstance(in_outs["outputs"][index][0], dict):
|
||||
in_outs["outputs"][index] = [{int(k): v for k,v in in_outs["outputs"][index][0].items()}]
|
||||
except:
|
||||
True
|
||||
|
||||
if debug:
|
||||
print(f"time: {datetime.now().time()} testing index = {index} inputs = {inputs}, {type(inputs)}. type = {which_type}")
|
||||
if which_type == CODE_TYPE.call_based: # Call-based
|
||||
signal.alarm(timeout)
|
||||
faulthandler.enable()
|
||||
try:
|
||||
# print("------------")
|
||||
# print(inputs)
|
||||
output = method(*inputs)
|
||||
|
||||
# ground truth sequences are not tuples
|
||||
if isinstance(output, tuple):
|
||||
output = list(output)
|
||||
|
||||
tmp_result = output == in_outs["outputs"][index]
|
||||
if isinstance(in_outs["outputs"][index], list) and in_outs["outputs"][index]:
|
||||
tmp_result = tmp_result or (output == in_outs["outputs"][index][0])
|
||||
|
||||
# ground truth sequences are not tuples
|
||||
try:
|
||||
if isinstance(output[0], tuple):
|
||||
tmp_result = tmp_result or ([list(x) for x in output] == in_outs["outputs"][index][0])
|
||||
except:
|
||||
True
|
||||
results.append(tmp_result)
|
||||
|
||||
# reset the alarm
|
||||
signal.alarm(0)
|
||||
except Exception as e:
|
||||
signal.alarm(0)
|
||||
faulthandler.disable()
|
||||
print(f"Standard input runtime error or time limit exceeded error = {e}")
|
||||
results.append(-1)
|
||||
continue
|
||||
faulthandler.disable()
|
||||
signal.alarm(0)
|
||||
if debug:
|
||||
print(f"outputs = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
|
||||
elif which_type == CODE_TYPE.standard_input: # Standard input
|
||||
faulthandler.enable()
|
||||
signal.alarm(timeout)
|
||||
passed = False
|
||||
|
||||
if isinstance(inputs, list):
|
||||
inputs = "\n".join(inputs)
|
||||
if isinstance(in_outs['outputs'][index], list):
|
||||
in_outs['outputs'][index] = "\n".join(in_outs['outputs'][index])
|
||||
|
||||
with Capturing() as output:
|
||||
try:
|
||||
call_method(method, inputs)
|
||||
# reset the alarm
|
||||
signal.alarm(0)
|
||||
passed = True
|
||||
except Exception as e:
|
||||
# runtime error or took too long
|
||||
signal.alarm(0)
|
||||
print(f"Call-based runtime error or time limit exceeded error = {repr(e)}{e}")
|
||||
results.append(-1)
|
||||
signal.alarm(0)
|
||||
|
||||
if not passed:
|
||||
if debug:
|
||||
nl = "\n"
|
||||
if not isinstance(inputs, list):
|
||||
print(f"not passed output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl,' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
|
||||
else:
|
||||
print(f"not passed output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
|
||||
continue
|
||||
|
||||
if passed and debug:
|
||||
print(f"==> output = {output}, test outputs = {in_outs['outputs'][index]}")
|
||||
|
||||
if custom_compare_(output, in_outs['outputs'][index]):
|
||||
tmp_result = True
|
||||
results.append(tmp_result)
|
||||
continue
|
||||
|
||||
# ground truth sequences are expressed as lists not tuples
|
||||
if isinstance(output, tuple):
|
||||
output = list(output)
|
||||
|
||||
tmp_result = False
|
||||
try:
|
||||
tmp_result = (output == [in_outs["outputs"][index]])
|
||||
if isinstance(in_outs["outputs"][index], list):
|
||||
tmp_result = tmp_result or (output == in_outs["outputs"][index])
|
||||
if isinstance(output[0], str):
|
||||
tmp_result = tmp_result or ([e.strip() for e in output] == in_outs["outputs"][index])
|
||||
except Exception as e:
|
||||
print(f"Failed check1 exception = {e}")
|
||||
pass
|
||||
|
||||
if tmp_result == True:
|
||||
results.append(tmp_result)
|
||||
continue
|
||||
|
||||
# try one more time without \n
|
||||
if isinstance(in_outs["outputs"][index], list):
|
||||
for tmp_index, i in enumerate(in_outs["outputs"][index]):
|
||||
in_outs["outputs"][index][tmp_index] = i.split("\n")
|
||||
in_outs["outputs"][index][tmp_index] = [x.strip() for x in in_outs["outputs"][index][tmp_index] if x]
|
||||
else:
|
||||
in_outs["outputs"][index] = in_outs["outputs"][index].split("\n")
|
||||
in_outs["outputs"][index] = list(filter(len, in_outs["outputs"][index]))
|
||||
in_outs["outputs"][index] = list(map(lambda x:x.strip(), in_outs["outputs"][index]))
|
||||
|
||||
try:
|
||||
tmp_result = (output == [in_outs["outputs"][index]])
|
||||
if isinstance(in_outs["outputs"][index], list):
|
||||
tmp_result = tmp_result or (output == in_outs["outputs"][index])
|
||||
except Exception as e:
|
||||
print(f"Failed check2 exception = {e}")
|
||||
pass
|
||||
|
||||
if tmp_result == True:
|
||||
results.append(tmp_result)
|
||||
continue
|
||||
|
||||
# try by converting the output into a split up list too
|
||||
if isinstance(output, list):
|
||||
output = list(filter(len, output))
|
||||
|
||||
if debug:
|
||||
nl = "\n"
|
||||
if not isinstance(inputs, list):
|
||||
print(f"output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl,' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
|
||||
else:
|
||||
print(f"output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
|
||||
|
||||
if tmp_result == True:
|
||||
results.append(tmp_result)
|
||||
continue
|
||||
|
||||
try:
|
||||
tmp_result = (output == [in_outs["outputs"][index]])
|
||||
if isinstance(in_outs["outputs"][index], list):
|
||||
tmp_result = tmp_result or (output == in_outs["outputs"][index])
|
||||
except Exception as e:
|
||||
print(f"Failed check3 exception = {e}")
|
||||
pass
|
||||
|
||||
try:
|
||||
output_float = [float(e) for e in output]
|
||||
gt_float = [float(e) for e in in_outs['outputs'][index]]
|
||||
tmp_result = tmp_result or ((len(output_float) == len(gt_float)) and np.allclose(output_float, gt_float))
|
||||
except Exception as e:
|
||||
pass
|
||||
try:
|
||||
if isinstance(output[0], list):
|
||||
output_float = [float(e) for e in output[0]]
|
||||
gt_float = [float(e) for e in in_outs['outputs'][index][0]]
|
||||
tmp_result = tmp_result or ((len(output_float) == len(gt_float)) and np.allclose(output_float, gt_float))
|
||||
except Exception as e:
|
||||
pass
|
||||
|
||||
if tmp_result == True:
|
||||
results.append(tmp_result)
|
||||
continue
|
||||
|
||||
# try by converting the stuff into split up list
|
||||
if isinstance(in_outs["outputs"][index], list):
|
||||
for tmp_index, i in enumerate(in_outs["outputs"][index]):
|
||||
in_outs["outputs"][index][tmp_index] = set(i.split())
|
||||
else:
|
||||
in_outs["outputs"][index] = set(in_outs["outputs"][index].split())
|
||||
|
||||
try:
|
||||
tmp_result = (output == in_outs["outputs"][index])
|
||||
except Exception as e:
|
||||
print(f"Failed check4 exception = {e}")
|
||||
continue
|
||||
|
||||
if tmp_result == True:
|
||||
results.append(tmp_result)
|
||||
continue
|
||||
|
||||
# try by converting the output into a split up list too
|
||||
if isinstance(output, list):
|
||||
for tmp_index, i in enumerate(output):
|
||||
output[tmp_index] = i.split()
|
||||
output = list(filter(len, output))
|
||||
for tmp_index, i in enumerate(output):
|
||||
output[tmp_index] = set(i)
|
||||
else:
|
||||
output = output.split()
|
||||
output = list(filter(len, output))
|
||||
output = set(output)
|
||||
|
||||
try:
|
||||
tmp_result = (set(frozenset(s) for s in output) == set(frozenset(s) for s in in_outs["outputs"][index]))
|
||||
except Exception as e:
|
||||
print(f"Failed check5 exception = {e}")
|
||||
|
||||
|
||||
# if they are all numbers, round so that similar numbers are treated as identical
|
||||
try:
|
||||
tmp_result = tmp_result or (set(frozenset(round(float(t),3) for t in s) for s in output) ==\
|
||||
set(frozenset(round(float(t),3) for t in s) for s in in_outs["outputs"][index]))
|
||||
except Exception as e:
|
||||
print(f"Failed check6 exception = {e}")
|
||||
|
||||
if tmp_result == True and debug:
|
||||
print("PASSED")
|
||||
|
||||
results.append(tmp_result)
|
||||
|
||||
if debug:
|
||||
nl = "\n"
|
||||
if not isinstance(inputs, list):
|
||||
print(f"output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl,' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
|
||||
else:
|
||||
print(f"output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
|
||||
|
||||
|
||||
return results
|
||||
|
||||
def custom_compare_(output, ground_truth):
|
||||
|
||||
if isinstance(output, list):
|
||||
output_1 = "\n".join(output)
|
||||
if stripped_string_compare(output_1, ground_truth):
|
||||
return True
|
||||
|
||||
if isinstance(output, list):
|
||||
output_2 = [o.lstrip().rstrip() for o in output]
|
||||
output_2 = "\n".join(output_2)
|
||||
if stripped_string_compare(output_2, ground_truth):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def stripped_string_compare(s1, s2):
|
||||
s1 = s1.lstrip().rstrip()
|
||||
s2 = s2.lstrip().rstrip()
|
||||
return s1 == s2
|
||||
|
||||
def call_method(method, inputs):
|
||||
|
||||
if isinstance(inputs, list):
|
||||
inputs = "\n".join(inputs)
|
||||
|
||||
inputs_line_iterator = iter(inputs.split("\n"))
|
||||
|
||||
# sys.setrecursionlimit(10000)
|
||||
|
||||
# @patch('builtins.input', side_effect=inputs.split("\n"))
|
||||
@patch('builtins.open', mock_open(read_data=inputs))
|
||||
@patch('sys.stdin', StringIO(inputs))
|
||||
@patch('sys.stdin.readline', lambda *args: next(inputs_line_iterator))
|
||||
@patch('sys.stdin.readlines', lambda *args: inputs.split("\n"))
|
||||
@patch('sys.stdin.read', lambda *args: inputs)
|
||||
# @patch('sys.stdout.write', print)
|
||||
def _inner_call_method(_method):
|
||||
try:
|
||||
return _method()
|
||||
except SystemExit as e:
|
||||
pass
|
||||
finally:
|
||||
pass
|
||||
return _inner_call_method(method)
|
||||
|
||||
def main(args):
|
||||
print(args)
|
||||
problem_list = sorted(get_valid_problems(args.source))
|
||||
print(f"number of problems = {len(problem_list)}")
|
||||
prob_index = args.number
|
||||
print(f"problem is {problem_list[prob_index]}")
|
||||
|
||||
# This checks it correctly loaded. remove this later
|
||||
assert prob_index < len(problem_list)
|
||||
|
||||
if args.data == "q" or args.data == "question":
|
||||
tmp = get_question(problem_list, prob_index)
|
||||
print("q", tmp)
|
||||
elif args.data in ["solutions", "sol", "s",]:
|
||||
tmp = get_solutions(problem_list, prob_index)
|
||||
print("sol", tmp)
|
||||
elif args.data == "starter":
|
||||
tmp = get_starter(problem_list, prob_index)
|
||||
print("starter", tmp)
|
||||
elif args.data in ["test", "t"]:
|
||||
# test it with sols
|
||||
sols = get_solutions(problem_list, prob_index)
|
||||
tmp = run_test(problem_list, prob_index, test=sols[0])
|
||||
|
||||
print("results = ", tmp)
|
||||
print("-2 = compile error, -1 is runtime error, False failed test, True passed test")
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_args()
|
||||
main(args)
|
4
scripts/evaluation/code_search_net.py
Normal file
@@ -0,0 +1,4 @@
from datasets import load_dataset

dataset = load_dataset("code_x_glue_ct_code_to_text", "go")
print(dataset)
23
scripts/evaluation/concode.py
Normal file
@@ -0,0 +1,23 @@
import pandas as pd

from datasets import load_dataset, load_metric
from fastcore.script import *
from pathlib import Path

bleu = load_metric("sacrebleu")

predictions = ["hello there kenobi", "foo bar foobar"]
references = [
    ["hello there general kenobi"],
    ["foo bar foobar"],  # , "hello there !"], # , "foo bar foobar"],
]


@call_parse
def main(concode_path: Param("Path to the concode data in CodeXGLUE", str)):
    concode_path = Path(concode_path)
    dataset = load_dataset("json", data_files=str(concode_path / "test.json"))
    print(dataset)
    results = bleu.compute(predictions=predictions, references=references)
    print(list(results.keys()))
    print(round(results["score"], 1))
165
scripts/evaluation/evaluate.py
Normal file
@@ -0,0 +1,165 @@
|
||||
import json
|
||||
import torch
|
||||
import pandas as pd
|
||||
|
||||
# import apps.eval.reident
|
||||
|
||||
from apps_utils.generate_gpt_codes import generate_prompt
|
||||
from apps_utils.test_one_solution import eval_and_save_problems
|
||||
from datasets import load_dataset, load_metric
|
||||
from fastcore.script import *
|
||||
from human_eval.data import write_jsonl, read_problems
|
||||
from pathlib import Path
|
||||
from metrics.extrinsic_eval import compute_metrics
|
||||
from subprocess import check_output
|
||||
from transformers import AutoTokenizer, AutoModelWithLMHead
|
||||
|
||||
bleu = load_metric("sacrebleu")
|
||||
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125M")
|
||||
model = AutoModelWithLMHead.from_pretrained(
|
||||
"/home/nathan/gpt-code-clippy/data/APPS/models/1.5B"
|
||||
)
|
||||
|
||||
|
||||
def generate_text(prompt):
|
||||
# print(prompt)
|
||||
input_ids = torch.LongTensor(tokenizer.encode(prompt, verbose=False)).unsqueeze(
|
||||
0
|
||||
) # .cuda()
|
||||
output_ids = model.generate(
|
||||
input_ids,
|
||||
num_beams=2,
|
||||
early_stopping=True,
|
||||
max_length=1024 - len(input_ids),
|
||||
)
|
||||
output_str = tokenizer.decode(output_ids[0])
|
||||
return output_str
|
||||
# # "a", "=", "b", "\n", "y", "=", "a", "+", "1"
|
||||
# return "a = b \n y = a + 1"
|
||||
|
||||
|
||||
def _eval_concode(path):
|
||||
# TODO: format input to model same as App and OpenAI HumanEval datasets are formatted
|
||||
data = load_dataset("json", data_files=str(path / "test.json"))["train"]
|
||||
predictions = [[]]
|
||||
references = []
|
||||
for example in data:
|
||||
output = generate_text(example["nl"])
|
||||
predictions[0].append(output.split(" "))
|
||||
references.append(example["code"].split(" "))
|
||||
results = compute_metrics(predictions, references)
|
||||
print(f"Bleu score for Concode dataset: {results}")
|
||||
|
||||
|
||||
def _eval_apps(path):
|
||||
gpt_codes = {}
|
||||
prob_paths = sorted(path.glob("*/"))
|
||||
# map prob_paths to strings and save as a json file
|
||||
str_paths = [str(p) for p in prob_paths]
|
||||
with open(path / "test.json", "w") as f:
|
||||
json.dump(str_paths, f)
|
||||
for index, prob_path in enumerate(prob_paths[:2]):
|
||||
test_case_path = prob_path / "input_output.json"
|
||||
prompt_path = prob_path / "question.txt"
|
||||
starter_path = prob_path / "starter_code.py"
|
||||
solutions_path = prob_path / "solutions.json"
|
||||
if not starter_path.exists():
|
||||
starter_path = None
|
||||
if not test_case_path.exists() or not prompt_path.exists():
|
||||
continue
|
||||
prompt = generate_prompt(
|
||||
test_case_path,
|
||||
prompt_path,
|
||||
solutions_path,
|
||||
tokenizer,
|
||||
starter_path=starter_path,
|
||||
)
|
||||
output = generate_text(prompt)
|
||||
print(output)
|
||||
# print(output)
|
||||
gpt_codes[index] = output
|
||||
# print(output)
|
||||
|
||||
with open(path.parent / "all_codes.json", "w") as f:
|
||||
json.dump(gpt_codes, f)
|
||||
|
||||
eval_and_save_problems(path, path.parent)
|
||||
|
||||
# execute bash command to run eval script
|
||||
# results = check_output(
|
||||
# [
|
||||
# # python3 test_one_solution.py -t /path/to/apps/test --save /path/to/save_dir --print_results
|
||||
# "python",
|
||||
# "./apps_utils/test_one_solution.py",
|
||||
# "-t",
|
||||
# str(path),
|
||||
# "--save",
|
||||
# str(path.parent),
|
||||
# "--print_results",
|
||||
# ]
|
||||
# ).decode("utf-8")
|
||||
|
||||
|
||||
# test_case_path = os.path.join(prob_path, "input_output.json")
|
||||
# prompt_path = os.path.join(prob_path, "question.txt")
|
||||
# starter_path = os.path.join(prob_path, "starter_code.py")
|
||||
# solutions_path = os.path.join(prob_path, "solutions.json")
|
||||
# generate_prompt(args, test_case_path, prompt_path, solutions_path, tokenizer, starter_path=None)
|
||||
|
||||
|
||||
def _eval_human_eval(path):
|
||||
problems = read_problems()
|
||||
num_samples_per_task = 1
|
||||
samples = [
|
||||
dict(
|
||||
task_id=task_id,
|
||||
completion=generate_text(problems[task_id]["prompt"]),
|
||||
)
|
||||
for task_id in problems
|
||||
for _ in range(num_samples_per_task)
|
||||
]
|
||||
write_jsonl("human_eval.jsonl", samples)
|
||||
# execute bash command to run eval script
|
||||
results = check_output(
|
||||
[
|
||||
"python",
|
||||
path / "evaluate_functional_correctness.py",
|
||||
"human_eval.jsonl",
|
||||
]
|
||||
).decode("utf-8")
|
||||
|
||||
print(results)
|
||||
|
||||
|
||||
@call_parse
|
||||
def main(
|
||||
concode_path: Param("Path to the concode data in CodeXGLUE", str),
|
||||
apps_path: Param("Path to the APPS dataset", str),
|
||||
human_eval_path: Param("Path to the human eval dataset", str),
|
||||
):
|
||||
concode_path = Path(concode_path)
|
||||
apps_path = Path(apps_path)
|
||||
human_eval_path = Path(human_eval_path)
|
||||
# _eval_concode(concode_path)
|
||||
# _eval_human_eval(human_eval_path)
|
||||
_eval_apps(apps_path)
|
||||
# dataset = load_dataset("json", data_files=str(concode_path / "test.json"))
|
||||
# print(dataset)
|
||||
# results = bleu.compute(predictions=predictions, references=references)
|
||||
# print(list(results.keys()))
|
||||
# print(round(results["score"], 1))
|
||||
|
||||
|
||||
# problems = read_problems()
|
||||
# print(problems)
|
||||
# num_samples_per_task = 200
|
||||
# samples = [
|
||||
# dict(
|
||||
# task_id=task_id,
|
||||
# completion=generate_text(problems[task_id]["prompt"]),
|
||||
# )
|
||||
# for task_id in problems[:1]
|
||||
# for _ in range(num_samples_per_task)
|
||||
# ]
|
||||
# write_jsonl("human_eval.jsonl", samples)
|
164
scripts/evaluation/human_eval.jsonl
Normal file
@@ -0,0 +1,164 @@
|
||||
{"task_id": "HumanEval/0", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/1", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/2", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/3", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/4", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/5", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/6", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/7", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/8", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/9", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/10", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/11", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/12", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/13", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/14", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/15", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/16", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/17", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/18", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/19", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/20", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/21", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/22", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/23", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/24", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/25", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/26", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/27", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/28", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/29", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/30", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/31", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/32", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/33", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/34", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/35", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/36", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/37", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/38", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/39", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/40", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/41", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/42", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/43", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/44", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/45", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/46", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/47", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/48", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/49", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/50", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/51", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/52", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/53", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/54", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/55", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/56", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/57", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/58", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/59", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/60", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/61", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/62", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/63", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/64", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/65", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/66", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/67", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/68", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/69", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/70", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/71", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/72", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/73", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/74", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/75", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/76", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/77", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/78", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/79", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/80", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/81", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/82", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/83", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/84", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/85", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/86", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/87", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/88", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/89", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/90", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/91", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/92", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/93", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/94", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/95", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/96", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/97", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/98", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/99", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/100", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/101", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/102", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/103", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/104", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/105", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/106", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/107", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/108", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/109", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/110", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/111", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/112", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/113", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/114", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/115", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/116", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/117", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/118", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/119", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/120", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/121", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/122", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/123", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/124", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/125", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/126", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/127", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/128", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/129", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/130", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/131", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/132", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/133", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/134", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/135", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/136", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/137", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/138", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/139", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/140", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/141", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/142", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/143", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/144", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/145", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/146", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/147", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/148", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/149", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/150", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/151", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/152", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/153", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/154", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/155", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/156", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/157", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/158", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/159", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/160", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/161", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/162", "completion": "a = b \n y = a + 1"}
|
||||
{"task_id": "HumanEval/163", "completion": "a = b \n y = a + 1"}
|
164
scripts/evaluation/human_eval.jsonl_results.jsonl
Normal file
@ -0,0 +1,164 @@
|
||||
{"task_id": "HumanEval/0", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
|
||||
{"task_id": "HumanEval/1", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
|
||||
{"task_id": "HumanEval/2", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
|
||||
{"task_id": "HumanEval/3", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
|
||||
{"task_id": "HumanEval/4", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
|
||||
{"task_id": "HumanEval/5", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
|
||||
{"task_id": "HumanEval/6", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
|
||||
{"task_id": "HumanEval/7", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
|
||||
{"task_id": "HumanEval/8", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
|
||||
{"task_id": "HumanEval/9", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
|
||||
{"task_id": "HumanEval/10", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 21)", "passed": false}
|
||||
{"task_id": "HumanEval/11", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
|
||||
{"task_id": "HumanEval/12", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
|
||||
{"task_id": "HumanEval/13", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
|
||||
{"task_id": "HumanEval/14", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 10)", "passed": false}
|
||||
{"task_id": "HumanEval/15", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
|
||||
{"task_id": "HumanEval/16", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
|
||||
{"task_id": "HumanEval/17", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 18)", "passed": false}
|
||||
{"task_id": "HumanEval/18", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
|
||||
{"task_id": "HumanEval/19", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
|
||||
{"task_id": "HumanEval/20", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
|
||||
{"task_id": "HumanEval/21", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
|
||||
{"task_id": "HumanEval/22", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
|
||||
{"task_id": "HumanEval/23", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
|
||||
{"task_id": "HumanEval/24", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 9)", "passed": false}
|
||||
{"task_id": "HumanEval/25", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 16)", "passed": false}
|
||||
{"task_id": "HumanEval/26", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
|
||||
{"task_id": "HumanEval/27", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 9)", "passed": false}
|
||||
{"task_id": "HumanEval/28", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
|
||||
{"task_id": "HumanEval/29", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
|
||||
{"task_id": "HumanEval/30", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
|
||||
{"task_id": "HumanEval/31", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 21)", "passed": false}
|
||||
{"task_id": "HumanEval/32", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 25)", "passed": false}
|
||||
{"task_id": "HumanEval/33", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
|
||||
{"task_id": "HumanEval/34", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 9)", "passed": false}
|
||||
{"task_id": "HumanEval/35", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
|
||||
{"task_id": "HumanEval/36", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
|
||||
{"task_id": "HumanEval/37", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
|
||||
{"task_id": "HumanEval/38", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 19)", "passed": false}
|
||||
{"task_id": "HumanEval/39", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 18)", "passed": false}
|
||||
{"task_id": "HumanEval/40", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 21)", "passed": false}
|
||||
{"task_id": "HumanEval/41", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 16)", "passed": false}
|
||||
{"task_id": "HumanEval/42", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
|
||||
{"task_id": "HumanEval/43", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 20)", "passed": false}
|
||||
{"task_id": "HumanEval/44", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
|
||||
{"task_id": "HumanEval/45", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 9)", "passed": false}
|
||||
{"task_id": "HumanEval/46", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 19)", "passed": false}
|
||||
{"task_id": "HumanEval/47", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
|
||||
{"task_id": "HumanEval/48", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 16)", "passed": false}
|
||||
{"task_id": "HumanEval/49", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 17)", "passed": false}
|
||||
{"task_id": "HumanEval/50", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
|
||||
{"task_id": "HumanEval/51", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 20)", "passed": false}
|
||||
{"task_id": "HumanEval/52", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
|
||||
{"task_id": "HumanEval/53", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
|
||||
{"task_id": "HumanEval/54", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 20)", "passed": false}
|
||||
{"task_id": "HumanEval/55", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
|
||||
{"task_id": "HumanEval/56", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 17)", "passed": false}
|
||||
{"task_id": "HumanEval/57", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
|
||||
{"task_id": "HumanEval/58", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
|
||||
{"task_id": "HumanEval/59", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
|
||||
{"task_id": "HumanEval/60", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 17)", "passed": false}
|
||||
{"task_id": "HumanEval/61", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 17)", "passed": false}
|
||||
{"task_id": "HumanEval/62", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
|
||||
{"task_id": "HumanEval/63", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 18)", "passed": false}
|
||||
{"task_id": "HumanEval/64", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 19)", "passed": false}
|
||||
{"task_id": "HumanEval/65", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
|
||||
{"task_id": "HumanEval/66", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 16)", "passed": false}
|
||||
{"task_id": "HumanEval/67", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 16)", "passed": false}
|
||||
{"task_id": "HumanEval/68", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 37)", "passed": false}
|
||||
{"task_id": "HumanEval/69", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
|
||||
{"task_id": "HumanEval/70", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
|
||||
{"task_id": "HumanEval/71", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
|
||||
{"task_id": "HumanEval/72", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 21)", "passed": false}
|
||||
{"task_id": "HumanEval/73", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
|
||||
{"task_id": "HumanEval/74", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 17)", "passed": false}
|
||||
{"task_id": "HumanEval/75", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
|
||||
{"task_id": "HumanEval/76", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
|
||||
{"task_id": "HumanEval/77", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 16)", "passed": false}
|
||||
{"task_id": "HumanEval/78", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 21)", "passed": false}
|
||||
{"task_id": "HumanEval/79", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
|
||||
{"task_id": "HumanEval/80", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
|
||||
{"task_id": "HumanEval/81", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 28)", "passed": false}
|
||||
{"task_id": "HumanEval/82", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
|
||||
{"task_id": "HumanEval/83", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 8)", "passed": false}
|
||||
{"task_id": "HumanEval/84", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 17)", "passed": false}
|
||||
{"task_id": "HumanEval/85", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 10)", "passed": false}
|
||||
{"task_id": "HumanEval/86", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 16)", "passed": false}
|
||||
{"task_id": "HumanEval/87", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 23)", "passed": false}
|
||||
{"task_id": "HumanEval/88", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 18)", "passed": false}
|
||||
{"task_id": "HumanEval/89", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
|
||||
{"task_id": "HumanEval/90", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
|
||||
{"task_id": "HumanEval/91", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
|
||||
{"task_id": "HumanEval/92", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 21)", "passed": false}
|
||||
{"task_id": "HumanEval/93", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 17)", "passed": false}
|
||||
{"task_id": "HumanEval/94", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 16)", "passed": false}
|
||||
{"task_id": "HumanEval/95", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
|
||||
{"task_id": "HumanEval/96", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
|
||||
{"task_id": "HumanEval/97", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
|
||||
{"task_id": "HumanEval/98", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
|
||||
{"task_id": "HumanEval/99", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 21)", "passed": false}
|
||||
{"task_id": "HumanEval/100", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 17)", "passed": false}
|
||||
{"task_id": "HumanEval/101", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
|
||||
{"task_id": "HumanEval/102", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
|
||||
{"task_id": "HumanEval/103", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
|
||||
{"task_id": "HumanEval/104", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
|
||||
{"task_id": "HumanEval/105", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 25)", "passed": false}
|
||||
{"task_id": "HumanEval/106", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
|
||||
{"task_id": "HumanEval/107", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 26)", "passed": false}
|
||||
{"task_id": "HumanEval/108", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
|
||||
{"task_id": "HumanEval/109", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 30)", "passed": false}
|
||||
{"task_id": "HumanEval/110", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 16)", "passed": false}
|
||||
{"task_id": "HumanEval/111", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 16)", "passed": false}
|
||||
{"task_id": "HumanEval/112", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
|
||||
{"task_id": "HumanEval/113", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
|
||||
{"task_id": "HumanEval/114", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
|
||||
{"task_id": "HumanEval/115", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 38)", "passed": false}
|
||||
{"task_id": "HumanEval/116", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
|
||||
{"task_id": "HumanEval/117", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 16)", "passed": false}
|
||||
{"task_id": "HumanEval/118", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 18)", "passed": false}
|
||||
{"task_id": "HumanEval/119", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 18)", "passed": false}
|
||||
{"task_id": "HumanEval/120", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 28)", "passed": false}
|
||||
{"task_id": "HumanEval/121", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
|
||||
{"task_id": "HumanEval/122", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 17)", "passed": false}
|
||||
{"task_id": "HumanEval/123", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 20)", "passed": false}
|
||||
{"task_id": "HumanEval/124", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 23)", "passed": false}
|
||||
{"task_id": "HumanEval/125", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
|
||||
{"task_id": "HumanEval/126", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 19)", "passed": false}
|
||||
{"task_id": "HumanEval/127", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 23)", "passed": false}
|
||||
{"task_id": "HumanEval/128", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
|
||||
{"task_id": "HumanEval/129", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 33)", "passed": false}
|
||||
{"task_id": "HumanEval/130", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 20)", "passed": false}
|
||||
{"task_id": "HumanEval/131", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
|
||||
{"task_id": "HumanEval/132", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 16)", "passed": false}
|
||||
{"task_id": "HumanEval/133", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 17)", "passed": false}
|
||||
{"task_id": "HumanEval/134", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 16)", "passed": false}
|
||||
{"task_id": "HumanEval/135", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
|
||||
{"task_id": "HumanEval/136", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
|
||||
{"task_id": "HumanEval/137", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
|
||||
{"task_id": "HumanEval/138", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 10)", "passed": false}
|
||||
{"task_id": "HumanEval/139", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
|
||||
{"task_id": "HumanEval/140", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
|
||||
{"task_id": "HumanEval/141", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 17)", "passed": false}
|
||||
{"task_id": "HumanEval/142", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 16)", "passed": false}
|
||||
{"task_id": "HumanEval/143", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 23)", "passed": false}
|
||||
{"task_id": "HumanEval/144", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
|
||||
{"task_id": "HumanEval/145", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
|
||||
{"task_id": "HumanEval/146", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 11)", "passed": false}
|
||||
{"task_id": "HumanEval/147", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 17)", "passed": false}
|
||||
{"task_id": "HumanEval/148", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 19)", "passed": false}
|
||||
{"task_id": "HumanEval/149", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 18)", "passed": false}
|
||||
{"task_id": "HumanEval/150", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 12)", "passed": false}
|
||||
{"task_id": "HumanEval/151", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 15)", "passed": false}
|
||||
{"task_id": "HumanEval/152", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 18)", "passed": false}
|
||||
{"task_id": "HumanEval/153", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 20)", "passed": false}
|
||||
{"task_id": "HumanEval/154", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
|
||||
{"task_id": "HumanEval/155", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 10)", "passed": false}
|
||||
{"task_id": "HumanEval/156", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
|
||||
{"task_id": "HumanEval/157", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
|
||||
{"task_id": "HumanEval/158", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
|
||||
{"task_id": "HumanEval/159", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 32)", "passed": false}
|
||||
{"task_id": "HumanEval/160", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 28)", "passed": false}
|
||||
{"task_id": "HumanEval/161", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 14)", "passed": false}
|
||||
{"task_id": "HumanEval/162", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 10)", "passed": false}
|
||||
{"task_id": "HumanEval/163", "completion": "a = b \n y = a + 1", "result": "failed: unexpected indent (<string>, line 13)", "passed": false}
|
0
scripts/evaluation/human_eval_bench.py
Normal file
133
scripts/evaluation/metrics/bleu.py
Normal file
@ -0,0 +1,133 @@
|
||||
# Copyright 2017 Google Inc. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
# The following code is taken from CodeXGlue Repository - https://github.com/microsoft/CodeXGLUE/blob/main/Code-Code/code-to-code-trans/evaluator/CodeBLEU/bleu.py
|
||||
|
||||
|
||||
"""Python implementation of BLEU and smooth-BLEU.
|
||||
|
||||
This module provides a Python implementation of BLEU and smooth-BLEU.
|
||||
Smooth BLEU is computed following the method outlined in the paper:
|
||||
Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic
|
||||
evaluation metrics for machine translation. COLING 2004.
|
||||
"""
|
||||
|
||||
import collections
|
||||
import math
|
||||
|
||||
|
||||
def _get_ngrams(segment, max_order):
|
||||
"""Extracts all n-grams upto a given maximum order from an input segment.
|
||||
|
||||
Args:
|
||||
segment: text segment from which n-grams will be extracted.
|
||||
max_order: maximum length in tokens of the n-grams returned by this
|
||||
method.
|
||||
|
||||
Returns:
|
||||
The Counter containing all n-grams up to max_order in segment
|
||||
with a count of how many times each n-gram occurred.
|
||||
"""
|
||||
ngram_counts = collections.Counter()
|
||||
for order in range(1, max_order + 1):
|
||||
for i in range(0, len(segment) - order + 1):
|
||||
ngram = tuple(segment[i : i + order])
|
||||
ngram_counts[ngram] += 1
|
||||
return ngram_counts
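# Illustrative example (not part of the upstream file), assuming a toy segment:
# _get_ngrams(["a", "=", "b"], 2) returns roughly
# Counter({('a',): 1, ('=',): 1, ('b',): 1, ('a', '='): 1, ('=', 'b'): 1})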
|
||||
|
||||
|
||||
def compute_bleu(reference_corpus, translation_corpus, max_order=4, smooth=True):
|
||||
"""Computes BLEU score of translated segments against one or more references.
|
||||
|
||||
Args:
|
||||
reference_corpus: list of lists of references for each translation. Each
|
||||
reference should be tokenized into a list of tokens.
|
||||
translation_corpus: list of translations to score. Each translation
|
||||
should be tokenized into a list of tokens.
|
||||
max_order: Maximum n-gram order to use when computing BLEU score.
|
||||
smooth: Whether or not to apply Lin et al. 2004 smoothing.
|
||||
|
||||
Returns:
|
||||
3-Tuple with the BLEU score, n-gram precisions, geometric mean of n-gram
|
||||
precisions and brevity penalty.
|
||||
"""
|
||||
matches_by_order = [0] * max_order
|
||||
possible_matches_by_order = [0] * max_order
|
||||
reference_length = 0
|
||||
translation_length = 0
|
||||
for (references, translation) in zip(reference_corpus, translation_corpus):
|
||||
reference_length += min(len(r) for r in references)
|
||||
translation_length += len(translation)
|
||||
|
||||
merged_ref_ngram_counts = collections.Counter()
|
||||
for reference in references:
|
||||
merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
|
||||
translation_ngram_counts = _get_ngrams(translation, max_order)
|
||||
overlap = translation_ngram_counts & merged_ref_ngram_counts
|
||||
for ngram in overlap:
|
||||
matches_by_order[len(ngram) - 1] += overlap[ngram]
|
||||
for order in range(1, max_order + 1):
|
||||
possible_matches = len(translation) - order + 1
|
||||
if possible_matches > 0:
|
||||
possible_matches_by_order[order - 1] += possible_matches
|
||||
|
||||
precisions = [0] * max_order
|
||||
for i in range(0, max_order):
|
||||
if smooth:
|
||||
precisions[i] = (matches_by_order[i] + 1.0) / (
|
||||
possible_matches_by_order[i] + 1.0
|
||||
)
|
||||
else:
|
||||
if possible_matches_by_order[i] > 0:
|
||||
precisions[i] = (
|
||||
float(matches_by_order[i]) / possible_matches_by_order[i]
|
||||
)
|
||||
else:
|
||||
precisions[i] = 0.0
|
||||
|
||||
if min(precisions) > 0:
|
||||
p_log_sum = sum((1.0 / max_order) * math.log(p) for p in precisions)
|
||||
geo_mean = math.exp(p_log_sum)
|
||||
else:
|
||||
geo_mean = 0
|
||||
|
||||
ratio = float(translation_length) / reference_length
|
||||
|
||||
if ratio > 1.0:
|
||||
bp = 1.0
|
||||
else:
|
||||
bp = math.exp(1 - 1.0 / ratio)
|
||||
bleu = geo_mean * bp
|
||||
bleu_score_dict = {
|
||||
"bleu": bleu,
|
||||
"precision": precisions,
|
||||
"bp": bp,
|
||||
"ratio": ratio,
|
||||
"trans_len": translation_length,
|
||||
"ref_len": reference_length,
|
||||
}
|
||||
return bleu_score_dict # (bleu, precisions, bp, ratio, translation_length, reference_length)
|
||||
|
||||
|
||||
def bleu_test_case():
|
||||
"""A simple functionality test case to evaluate BLEU"""
|
||||
generated = [[["a", "=", "b", "\n", "y", "=", "a", "+", "1"]]]
|
||||
reference = [["a", "=", "b", "\n", "print", "a"]]
|
||||
score_dict = compute_bleu(generated, reference, smooth=False)
|
||||
return score_dict
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
score_dict = bleu_test_case()
|
||||
print(score_dict)
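# Minimal usage sketch (assumed tokenisation, not part of the upstream file).
# Note that the reference corpus is nested one level deeper than the candidate
# corpus, because every candidate may have several references:
# refs = [[["return", "a", "+", "b"]]]   # 1 sample with 1 reference
# cands = [["return", "a", "+", "b"]]    # 1 generated candidate
# print(compute_bleu(refs, cands, smooth=True)["bleu"])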
|
46
scripts/evaluation/metrics/extrinsic_eval.py
Normal file
@ -0,0 +1,46 @@
|
||||
from metrics.bleu import compute_bleu
|
||||
|
||||
|
||||
def compute_exact_match(references, generated) -> float:
|
||||
"""
|
||||
Computes Exact Match Accuracy.
|
||||
args:
|
||||
reference: list of lists of references for each translation. Each
|
||||
reference should be tokenized into a list of tokens.
|
||||
translation: list of translations to score. Each translation
|
||||
should be tokenized into a list of tokens.
|
||||
returns:
|
||||
exact_match_accuracy : Float
|
||||
"""
|
||||
assert len(references[0]) == len(generated), "Number of samples in references and generated outputs must match."
|
||||
exact_match_count = 0.0
|
||||
for gen, ref in zip(generated, references[0]):
|
||||
if gen == ref:
|
||||
exact_match_count += 1
|
||||
exact_match_acc = exact_match_count / len(generated)
|
||||
return exact_match_acc
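# Illustrative sketch (assumed shapes): references[0] holds one reference per
# generated sample, so this toy call scores 0.5.
# refs = [[["a", "=", "1"], ["b", "=", "2"]]]
# gens = [["a", "=", "1"], ["b", "=", "3"]]
# compute_exact_match(refs, gens)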
|
||||
|
||||
|
||||
def compute_metrics(references, generated) -> dict:
|
||||
"""
|
||||
Calculates various metrics and returns a dict with the computed values.
|
||||
args:
|
||||
reference: list of lists of references for each translation. Each
|
||||
reference should be tokenized into a list of tokens.
|
||||
translation: list of translations to score. Each translation
|
||||
should be tokenized into a list of tokens.
|
||||
returns:
|
||||
A dictionary containing the computed metrics.
|
||||
"""
|
||||
metrics_dict = {
|
||||
"smoothed_bleu_4": None,
|
||||
"bleu_4": None,
|
||||
"exact_match_acc": None,
|
||||
}  # Update as new metrics are added.
|
||||
metrics_dict["smoothed_bleu_4"] = compute_bleu(references, generated, smooth=True)
|
||||
metrics_dict["bleu_4"] = compute_bleu(references, generated, smooth=False)
|
||||
metrics_dict["exact_match_acc"] = compute_exact_match(references, generated)
|
||||
return metrics_dict
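# Usage sketch (assumed single-reference shapes; tokens are illustrative):
# refs = [[["a", "=", "1"]]]
# gens = [["a", "=", "1"]]
# print(compute_metrics(refs, gens))  # keys: smoothed_bleu_4, bleu_4, exact_match_acc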
|
31
scripts/get_license_info.py
Normal file
@ -0,0 +1,31 @@
|
||||
import os
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from fastcore.script import *
|
||||
from ghapi.all import GhApi
|
||||
|
||||
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN")
|
||||
|
||||
|
||||
# Get the license name of a repo via the GitHub API
|
||||
def get_license_info(owner, repo):
|
||||
api = GhApi(owner=owner, repo=repo, token=GITHUB_TOKEN)
|
||||
license = api.licenses.get_for_repo(owner=owner, repo=repo)
|
||||
return license.license.name
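# Usage sketch (hypothetical repository; requires a valid GITHUB_TOKEN):
# get_license_info("openai", "human-eval")  # -> e.g. "MIT License"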
|
||||
|
||||
@call_parse
|
||||
def main(repos_path: Param("Path to the csv containing all of the repos", str)):
|
||||
"""
|
||||
Read the repos csv and record the license for each repository.
|
||||
"""
|
||||
repos_path = Path(repos_path)
|
||||
df = pd.read_csv(repos_path)
|
||||
|
||||
# Loop through repos and get their license
|
||||
licenses = []
|
||||
for _, row in df.iterrows():
|
||||
owner, repo = row["name"].split("/")
|
||||
licenses.append(get_license_info(owner, repo))
|
||||
df["license"] = licenses
|
||||
df.to_csv(repos_path.parent/f"{repos_path.stem}_with_license.csv", index=False)
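# Assumed invocation (call_parse exposes the annotated parameter as a
# positional CLI argument):
#   python scripts/get_license_info.py repos.csv
# which writes repos_with_license.csv next to the input file.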
|