diff --git a/metrics/bleu.py b/metrics/bleu.py
index 36b7839..17a06ce 100644
--- a/metrics/bleu.py
+++ b/metrics/bleu.py
@@ -28,98 +28,106 @@ import math
 
 
 def _get_ngrams(segment, max_order):
-  """Extracts all n-grams upto a given maximum order from an input segment.
+    """Extracts all n-grams up to a given maximum order from an input segment.
 
-  Args:
-    segment: text segment from which n-grams will be extracted.
-    max_order: maximum length in tokens of the n-grams returned by this
-        methods.
+    Args:
+        segment: text segment from which n-grams will be extracted.
+        max_order: maximum length in tokens of the n-grams returned by this
+            method.
 
-  Returns:
-    The Counter containing all n-grams upto max_order in segment
-    with a count of how many times each n-gram occurred.
-  """
-  ngram_counts = collections.Counter()
-  for order in range(1, max_order + 1):
-    for i in range(0, len(segment) - order + 1):
-      ngram = tuple(segment[i:i+order])
-      ngram_counts[ngram] += 1
-  return ngram_counts
+    Returns:
+        The Counter containing all n-grams up to max_order in segment
+        with a count of how many times each n-gram occurred.
+    """
+    ngram_counts = collections.Counter()
+    for order in range(1, max_order + 1):
+        for i in range(0, len(segment) - order + 1):
+            ngram = tuple(segment[i : i + order])
+            ngram_counts[ngram] += 1
+    return ngram_counts
 
 
-def compute_bleu(reference_corpus, translation_corpus, max_order=4,
-                 smooth=True):
-  """Computes BLEU score of translated segments against one or more references.
+def compute_bleu(reference_corpus, translation_corpus, max_order=4, smooth=True):
+    """Computes BLEU score of translated segments against one or more references.
 
-  Args:
-    reference_corpus: list of lists of references for each translation. Each
-        reference should be tokenized into a list of tokens.
-    translation_corpus: list of translations to score. Each translation
-        should be tokenized into a list of tokens.
-    max_order: Maximum n-gram order to use when computing BLEU score.
-    smooth: Whether or not to apply Lin et al. 2004 smoothing.
+    Args:
+        reference_corpus: list of lists of references for each translation. Each
+            reference should be tokenized into a list of tokens.
+        translation_corpus: list of translations to score. Each translation
+            should be tokenized into a list of tokens.
+        max_order: Maximum n-gram order to use when computing BLEU score.
+        smooth: Whether or not to apply Lin et al. 2004 smoothing.
 
-  Returns:
-    3-Tuple with the BLEU score, n-gram precisions, geometric mean of n-gram
-    precisions and brevity penalty.
-  """
-  matches_by_order = [0] * max_order
-  possible_matches_by_order = [0] * max_order
-  reference_length = 0
-  translation_length = 0
-  for (references, translation) in zip(reference_corpus,
-                                       translation_corpus):
-    reference_length += min(len(r) for r in references)
-    translation_length += len(translation)
+    Returns:
+        Dict with the BLEU score, n-gram precisions, brevity penalty, length
+        ratio, translation length and reference length.
+ """ + matches_by_order = [0] * max_order + possible_matches_by_order = [0] * max_order + reference_length = 0 + translation_length = 0 + for (references, translation) in zip(reference_corpus, translation_corpus): + reference_length += min(len(r) for r in references) + translation_length += len(translation) - merged_ref_ngram_counts = collections.Counter() - for reference in references: - merged_ref_ngram_counts |= _get_ngrams(reference, max_order) - translation_ngram_counts = _get_ngrams(translation, max_order) - overlap = translation_ngram_counts & merged_ref_ngram_counts - for ngram in overlap: - matches_by_order[len(ngram)-1] += overlap[ngram] - for order in range(1, max_order+1): - possible_matches = len(translation) - order + 1 - if possible_matches > 0: - possible_matches_by_order[order-1] += possible_matches + merged_ref_ngram_counts = collections.Counter() + for reference in references: + merged_ref_ngram_counts |= _get_ngrams(reference, max_order) + translation_ngram_counts = _get_ngrams(translation, max_order) + overlap = translation_ngram_counts & merged_ref_ngram_counts + for ngram in overlap: + matches_by_order[len(ngram) - 1] += overlap[ngram] + for order in range(1, max_order + 1): + possible_matches = len(translation) - order + 1 + if possible_matches > 0: + possible_matches_by_order[order - 1] += possible_matches - precisions = [0] * max_order - for i in range(0, max_order): - if smooth: - precisions[i] = ((matches_by_order[i] + 1.) / - (possible_matches_by_order[i] + 1.)) + precisions = [0] * max_order + for i in range(0, max_order): + if smooth: + precisions[i] = (matches_by_order[i] + 1.0) / ( + possible_matches_by_order[i] + 1.0 + ) + else: + if possible_matches_by_order[i] > 0: + precisions[i] = ( + float(matches_by_order[i]) / possible_matches_by_order[i] + ) + else: + precisions[i] = 0.0 + + if min(precisions) > 0: + p_log_sum = sum((1.0 / max_order) * math.log(p) for p in precisions) + geo_mean = math.exp(p_log_sum) else: - if possible_matches_by_order[i] > 0: - precisions[i] = (float(matches_by_order[i]) / - possible_matches_by_order[i]) - else: - precisions[i] = 0.0 + geo_mean = 0 - if min(precisions) > 0: - p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions) - geo_mean = math.exp(p_log_sum) - else: - geo_mean = 0 + ratio = float(translation_length) / reference_length - ratio = float(translation_length) / reference_length + if ratio > 1.0: + bp = 1.0 + else: + bp = math.exp(1 - 1.0 / ratio) + bleu = geo_mean * bp + bleu_score_dict = { + "bleu": bleu, + "precision": precisions, + "bp": bp, + "ratio": ratio, + "trans_len": translation_length, + "ref_len": reference_length, + } + return bleu_score_dict # (bleu, precisions, bp, ratio, translation_length, reference_length) - if ratio > 1.0: - bp = 1. - else: - bp = math.exp(1 - 1. 
-  bleu = geo_mean * bp
-  print(geo_mean)
-  bleu_score_dict = {"bleu":bleu,"precision":precisions,"bp":bp,"ratio":ratio,"trans_len":translation_length,"ref_len":reference_length}
-  return bleu_score_dict#(bleu, precisions, bp, ratio, translation_length, reference_length)
 
 
 def bleu_test_case():
     """A simple functionality test case to evaluate BLEU"""
-    generated = [[["a","=","b","\n","y","=","a","+","1"]]]
-    reference = [["a","=","b","\n","print","a"]]
-    score_dict = compute_bleu(generated,reference,smooth=False)
+    generated = [[["a", "=", "b", "\n", "y", "=", "a", "+", "1"]]]
+    reference = [["a", "=", "b", "\n", "print", "a"]]
+    score_dict = compute_bleu(generated, reference, smooth=False)
     return score_dict
 
+
 if __name__ == "__main__":
     score_dict = bleu_test_case()
-    print(score_dict)
\ No newline at end of file
+    print(score_dict)
diff --git a/metrics/extrinsic_eval.py b/metrics/extrinsic_eval.py
index bd6e81a..c414912 100644
--- a/metrics/extrinsic_eval.py
+++ b/metrics/extrinsic_eval.py
@@ -1,7 +1,7 @@
 from metrics.bleu import compute_bleu
 
 
-def compute_exact_match(references,generated)->float:
+def compute_exact_match(references, generated) -> float:
     """
     Computes Exact Match Accuracy.
     args:
@@ -12,15 +12,19 @@ def compute_exact_match(references, generated) -> float:
     returns:
         exact_match_accuracy : Float
     """
-    assert(len(references[0])==len(generated),"Number of Samples should be equal in References and Synthesized Outputs..")
+    assert len(references[0]) == len(generated), (
+        "Number of Samples should be equal in References "
+        "and Synthesized Outputs.."
+    )
     exact_match_count = 0.0
-    for gen,ref in zip(generated, references[0]):
+    for gen, ref in zip(generated, references[0]):
         if gen == ref:
             exact_match_count += 1
-    exact_match_acc = exact_match_count/len(generated)
+    exact_match_acc = exact_match_count / len(generated)
     return exact_match_acc
 
-def compute_metrics(references,generated) -> dict:
+
+def compute_metrics(references, generated) -> dict:
     """
     Calculates various metrics and returns the calculated dict of these matrics.
     args:
@@ -31,8 +35,12 @@ def compute_metrics(references,generated) -> dict:
     returns:
        A dicitonary with different metrics intact.
    """
-    metrics_dict = {} #Update as in new metrics are added over here.
-    metrics_dict["smoothed_bleu_4"] = compute_bleu(references,generated,smooth=True)
-    metrics_dict["bleu_4"] = compute_bleu(references,generated,smooth=False)
-    metrics_dict["exact_match_acc"] = compute_exact_match(references,generated)
-    return metrics_dict
\ No newline at end of file
+    metrics_dict = {
+        "smoothed_bleu_4": None,
+        "bleu_4": None,
+        "exact_match_acc": None,
+    }  # Update as new metrics are added here.
+    metrics_dict["smoothed_bleu_4"] = compute_bleu(references, generated, smooth=True)
+    metrics_dict["bleu_4"] = compute_bleu(references, generated, smooth=False)
+    metrics_dict["exact_match_acc"] = compute_exact_match(references, generated)
+    return metrics_dict
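
Reviewer note (not part of the patch): a minimal sketch of how the dict now returned by compute_bleu might be consumed, using the corpus shapes described in its docstring. The token lists are placeholder data, and the snippet assumes the repository root is on PYTHONPATH so the metrics package is importable.

from metrics.bleu import compute_bleu

# One tokenized candidate with a single tokenized reference:
# reference_corpus is a list (one entry per sample) of lists of references,
# translation_corpus is a list (one entry per sample) of token lists.
reference_corpus = [[["a", "=", "b", "\n", "print", "a"]]]
translation_corpus = [["a", "=", "b", "\n", "y", "=", "a", "+", "1"]]

scores = compute_bleu(reference_corpus, translation_corpus, max_order=4, smooth=True)

# The former 6-tuple is now keyed explicitly.
print(scores["bleu"], scores["precision"])
print(scores["bp"], scores["ratio"], scores["trans_len"], scores["ref_len"])

Callers that previously unpacked the positional return value (bleu, precisions, bp, ratio, translation_length, reference_length) will need to switch to this key-based access.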