diff --git a/metrics/bleu.py b/metrics/bleu.py
index 36b7839..17a06ce 100644
--- a/metrics/bleu.py
+++ b/metrics/bleu.py
@@ -28,98 +28,106 @@ import math
 
 
 def _get_ngrams(segment, max_order):
-  """Extracts all n-grams upto a given maximum order from an input segment.
+    """Extracts all n-grams up to a given maximum order from an input segment.
 
-  Args:
-    segment: text segment from which n-grams will be extracted.
-    max_order: maximum length in tokens of the n-grams returned by this
-        methods.
+    Args:
+        segment: text segment from which n-grams will be extracted.
+        max_order: maximum length in tokens of the n-grams returned by this
+            method.
 
-  Returns:
-    The Counter containing all n-grams upto max_order in segment
-    with a count of how many times each n-gram occurred.
-  """
-  ngram_counts = collections.Counter()
-  for order in range(1, max_order + 1):
-    for i in range(0, len(segment) - order + 1):
-      ngram = tuple(segment[i:i+order])
-      ngram_counts[ngram] += 1
-  return ngram_counts
+    Returns:
+        The Counter containing all n-grams up to max_order in segment
+        with a count of how many times each n-gram occurred.
+    """
+    ngram_counts = collections.Counter()
+    for order in range(1, max_order + 1):
+        for i in range(0, len(segment) - order + 1):
+            ngram = tuple(segment[i : i + order])
+            ngram_counts[ngram] += 1
+    return ngram_counts
 
 
-def compute_bleu(reference_corpus, translation_corpus, max_order=4,
-                 smooth=True):
-  """Computes BLEU score of translated segments against one or more references.
+def compute_bleu(reference_corpus, translation_corpus, max_order=4, smooth=True):
+    """Computes BLEU score of translated segments against one or more references.
 
-  Args:
-    reference_corpus: list of lists of references for each translation. Each
-        reference should be tokenized into a list of tokens.
-    translation_corpus: list of translations to score. Each translation
-        should be tokenized into a list of tokens.
-    max_order: Maximum n-gram order to use when computing BLEU score.
-    smooth: Whether or not to apply Lin et al. 2004 smoothing.
+    Args:
+        reference_corpus: list of lists of references for each translation. Each
+            reference should be tokenized into a list of tokens.
+        translation_corpus: list of translations to score. Each translation
+            should be tokenized into a list of tokens.
+        max_order: Maximum n-gram order to use when computing BLEU score.
+        smooth: Whether or not to apply Lin et al. 2004 smoothing.
 
-  Returns:
-    3-Tuple with the BLEU score, n-gram precisions, geometric mean of n-gram
-    precisions and brevity penalty.
-  """
-  matches_by_order = [0] * max_order
-  possible_matches_by_order = [0] * max_order
-  reference_length = 0
-  translation_length = 0
-  for (references, translation) in zip(reference_corpus,
-                                       translation_corpus):
-    reference_length += min(len(r) for r in references)
-    translation_length += len(translation)
+    Returns:
+        Dict with the BLEU score, n-gram precisions, brevity penalty, length
+        ratio, translation length and reference length.
+ """ + matches_by_order = [0] * max_order + possible_matches_by_order = [0] * max_order + reference_length = 0 + translation_length = 0 + for (references, translation) in zip(reference_corpus, translation_corpus): + reference_length += min(len(r) for r in references) + translation_length += len(translation) - merged_ref_ngram_counts = collections.Counter() - for reference in references: - merged_ref_ngram_counts |= _get_ngrams(reference, max_order) - translation_ngram_counts = _get_ngrams(translation, max_order) - overlap = translation_ngram_counts & merged_ref_ngram_counts - for ngram in overlap: - matches_by_order[len(ngram)-1] += overlap[ngram] - for order in range(1, max_order+1): - possible_matches = len(translation) - order + 1 - if possible_matches > 0: - possible_matches_by_order[order-1] += possible_matches + merged_ref_ngram_counts = collections.Counter() + for reference in references: + merged_ref_ngram_counts |= _get_ngrams(reference, max_order) + translation_ngram_counts = _get_ngrams(translation, max_order) + overlap = translation_ngram_counts & merged_ref_ngram_counts + for ngram in overlap: + matches_by_order[len(ngram) - 1] += overlap[ngram] + for order in range(1, max_order + 1): + possible_matches = len(translation) - order + 1 + if possible_matches > 0: + possible_matches_by_order[order - 1] += possible_matches - precisions = [0] * max_order - for i in range(0, max_order): - if smooth: - precisions[i] = ((matches_by_order[i] + 1.) / - (possible_matches_by_order[i] + 1.)) + precisions = [0] * max_order + for i in range(0, max_order): + if smooth: + precisions[i] = (matches_by_order[i] + 1.0) / ( + possible_matches_by_order[i] + 1.0 + ) + else: + if possible_matches_by_order[i] > 0: + precisions[i] = ( + float(matches_by_order[i]) / possible_matches_by_order[i] + ) + else: + precisions[i] = 0.0 + + if min(precisions) > 0: + p_log_sum = sum((1.0 / max_order) * math.log(p) for p in precisions) + geo_mean = math.exp(p_log_sum) else: - if possible_matches_by_order[i] > 0: - precisions[i] = (float(matches_by_order[i]) / - possible_matches_by_order[i]) - else: - precisions[i] = 0.0 + geo_mean = 0 - if min(precisions) > 0: - p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions) - geo_mean = math.exp(p_log_sum) - else: - geo_mean = 0 + ratio = float(translation_length) / reference_length - ratio = float(translation_length) / reference_length + if ratio > 1.0: + bp = 1.0 + else: + bp = math.exp(1 - 1.0 / ratio) + bleu = geo_mean * bp + bleu_score_dict = { + "bleu": bleu, + "precision": precisions, + "bp": bp, + "ratio": ratio, + "trans_len": translation_length, + "ref_len": reference_length, + } + return bleu_score_dict # (bleu, precisions, bp, ratio, translation_length, reference_length) - if ratio > 1.0: - bp = 1. - else: - bp = math.exp(1 - 1. 
-  bleu = geo_mean * bp
-  print(geo_mean)
-  bleu_score_dict = {"bleu":bleu,"precision":precisions,"bp":bp,"ratio":ratio,"trans_len":translation_length,"ref_len":reference_length}
-  return bleu_score_dict#(bleu, precisions, bp, ratio, translation_length, reference_length)
 
 
 def bleu_test_case():
     """A simple functionality test case to evaluate BLEU"""
-    generated = [[["a","=","b","\n","y","=","a","+","1"]]]
-    reference = [["a","=","b","\n","print","a"]]
-    score_dict = compute_bleu(generated,reference,smooth=False)
+    generated = [[["a", "=", "b", "\n", "y", "=", "a", "+", "1"]]]
+    reference = [["a", "=", "b", "\n", "print", "a"]]
+    score_dict = compute_bleu(generated, reference, smooth=False)
     return score_dict
 
+
 if __name__ == "__main__":
     score_dict = bleu_test_case()
-    print(score_dict)
\ No newline at end of file
+    print(score_dict)
diff --git a/metrics/extrinsic_eval.py b/metrics/extrinsic_eval.py
index bd6e81a..c414912 100644
--- a/metrics/extrinsic_eval.py
+++ b/metrics/extrinsic_eval.py
@@ -1,7 +1,7 @@
 from metrics.bleu import compute_bleu
 
 
-def compute_exact_match(references,generated)->float:
+def compute_exact_match(references, generated) -> float:
     """
     Computes Exact Match Accuracy.
     args:
@@ -12,15 +12,19 @@ def compute_exact_match(references, generated) -> float:
     returns:
         exact_match_accuracy : Float
     """
-    assert(len(references[0])==len(generated),"Number of Samples should be equal in References and Synthesized Outputs..")
+    assert len(references[0]) == len(generated), (
+        "Number of Samples should be equal in References "
+        "and Synthesized Outputs.."
+    )
     exact_match_count = 0.0
-    for gen,ref in zip(generated, references[0]):
+    for gen, ref in zip(generated, references[0]):
         if gen == ref:
             exact_match_count += 1
-    exact_match_acc = exact_match_count/len(generated)
+    exact_match_acc = exact_match_count / len(generated)
     return exact_match_acc
 
-def compute_metrics(references,generated) -> dict:
+
+def compute_metrics(references, generated) -> dict:
     """
     Calculates various metrics and returns the calculated dict of these matrics.
     args:
@@ -31,8 +35,12 @@ def compute_metrics(references,generated) -> dict:
     returns:
        A dicitonary with different metrics intact.
    """
-    metrics_dict = {} #Update as in new metrics are added over here.
-    metrics_dict["smoothed_bleu_4"] = compute_bleu(references,generated,smooth=True)
-    metrics_dict["bleu_4"] = compute_bleu(references,generated,smooth=False)
-    metrics_dict["exact_match_acc"] = compute_exact_match(references,generated)
-    return metrics_dict
\ No newline at end of file
+    metrics_dict = {
+        "smoothed_bleu_4": None,
+        "bleu_4": None,
+        "exact_match_acc": None,
+    }  # Update as new metrics are added here.
+    metrics_dict["smoothed_bleu_4"] = compute_bleu(references, generated, smooth=True)
+    metrics_dict["bleu_4"] = compute_bleu(references, generated, smooth=False)
+    metrics_dict["exact_match_acc"] = compute_exact_match(references, generated)
+    return metrics_dict
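
Reviewer note (not part of the patch): a minimal sketch of how the dict now returned by compute_bleu might be consumed, using the corpus shapes described in its docstring. The token lists are placeholder data, and the snippet assumes the repository root is on PYTHONPATH so the metrics package is importable.

from metrics.bleu import compute_bleu

# One tokenized candidate with a single tokenized reference:
# reference_corpus is a list (one entry per sample) of lists of references,
# translation_corpus is a list (one entry per sample) of token lists.
reference_corpus = [[["a", "=", "b", "\n", "print", "a"]]]
translation_corpus = [["a", "=", "b", "\n", "y", "=", "a", "+", "1"]]

scores = compute_bleu(reference_corpus, translation_corpus, max_order=4, smooth=True)

# The former 6-tuple is now keyed explicitly.
print(scores["bleu"], scores["precision"])
print(scores["bp"], scores["ratio"], scores["trans_len"], scores["ref_len"])

Callers that previously unpacked the positional return value (bleu, precisions, bp, ratio, translation_length, reference_length) will need to switch to this key-based access.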