from collections import defaultdict
import re
import math
from rouge import Rouge
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
def compute_bleu_scores(candidate_list, reference_list, avg=False):
    bleu_scores = []
    smoothie = SmoothingFunction().method1
    for candidate, references in zip(candidate_list, reference_list):
        candidate_tokens = candidate.split()
        reference_tokens = [reference.split() for reference in references]
        # Determine the maximum order of n-grams we can consider
        min_length = min(len(reference) for reference in reference_tokens)
        if min_length > 4:
            weights = (0.25, 0.25, 0.25, 0.25)  # Standard uniform 4-gram weights when the references are long enough
        else:
            # Shorten the n-gram order to the minimum reference length and spread the weights uniformly
            weights = tuple(1 / min_length for _ in range(min_length)) + tuple(0 for _ in range(4 - min_length))
        # Calculate BLEU score with the dynamically chosen weights
        bleu = sentence_bleu(reference_tokens, candidate_tokens, weights=weights, smoothing_function=smoothie)
        bleu_scores.append(bleu)
    if avg:
        return np.mean(bleu_scores)
    else:
        return bleu_scores
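# Illustrative usage (a sketch; the sentences below are made up and not part of this file):
#   candidates = ["the cat sat on the mat"]
#   references = [["a cat is sitting on the mat", "there is a cat on the mat"]]
#   compute_bleu_scores(candidates, references)            # -> list of per-sentence smoothed BLEU scores
#   compute_bleu_scores(candidates, references, avg=True)  # -> mean BLEU over the corpus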
def calculate_rouge(candidate, reference):
    '''
    candidate, reference: generated and ground-truth sentences
    '''
    rouge = Rouge()
    # rouge.get_scores expects the hypotheses and references as parallel lists of equal length
    scores = rouge.get_scores([candidate], [reference])
    return scores
def brevity_penalty(candidate, references):
    c = len(candidate)
    ref_lens = (len(reference) for reference in references)
    # Effective reference length: the one closest to the candidate length (ties go to the shorter reference)
    r = min(ref_lens, key=lambda ref_len: (abs(ref_len - c), ref_len))
    if c == 0:
        # An empty candidate gets no credit; also avoids the division by zero below
        return 0
    if c > r:
        return 1
    else:
        return math.exp(1 - r / c)
def modified_precision(candidate, references, n):
    max_frequency = defaultdict(int)
    min_frequency = defaultdict(int)
    candidate_words = split_sentence(candidate, n)
    for reference in references:
        reference_words = split_sentence(reference, n)
        for word in candidate_words:
            # Largest count of this n-gram in any single reference
            max_frequency[word] = max(max_frequency[word], reference_words[word])
    for word in candidate_words:
        # Clip the candidate count by the maximum reference count
        min_frequency[word] = min(max_frequency[word], candidate_words[word])
    total = sum(candidate_words.values())
    if total == 0:
        # The candidate has no n-grams of this order; treat the precision as 0
        return 0
    P = sum(min_frequency.values()) / total
    return P
def split_sentence(sentence, n):
    # Returns a dict mapping each n-gram (tokens joined by spaces) to its count in the cleaned sentence
    words = defaultdict(int)
    tmp_sentence = re.sub("[^a-zA-Z ]", "", sentence)
    tmp_sentence = tmp_sentence.lower()
    tmp_sentence = tmp_sentence.strip().split()
    length = len(tmp_sentence)
    for i in range(length - n + 1):
        tmp_words = " ".join(tmp_sentence[i: i + n])
        if tmp_words:
            words[tmp_words] += 1
    return words
def bleu(candidate, references, n, weights):
    pn = []
    bp = brevity_penalty(candidate, references)
    for i in range(n):
        pn.append(modified_precision(candidate, references, i + 1))
    if len(weights) > len(pn):
        # Too many weights were supplied: ignore the extra ones
        tmp_weights = list(weights[:len(pn)])
        bleu_result = calculate_bleu(tmp_weights, pn, n, bp)
        return str(bleu_result) + " (warning: the length of weights is greater than n)"
    elif len(weights) < len(pn):
        # Too few weights were supplied: pad the missing ones with 0
        tmp_weights = list(weights) + [0] * (len(pn) - len(weights))
        bleu_result = calculate_bleu(tmp_weights, pn, n, bp)
        return str(bleu_result) + " (warning: the length of weights is smaller than n)"
    else:
        bleu_result = calculate_bleu(weights, pn, n, bp)
        return str(bleu_result)
#BLEU
def calculate_bleu(weights, pn, n, bp):
    sum_wlogp = 0
    for i in range(n):
        if pn[i] != 0:
            sum_wlogp += float(weights[i]) * math.log(pn[i])
    bleu_result = bp * math.exp(sum_wlogp)
    return bleu_result
#Exact match
def calculate_exactmatch(candidate, reference):
    # Counts how many distinct reference unigrams appear in the candidate,
    # normalized by the total number of candidate tokens
    candidate_words = split_sentence(candidate, 1)
    reference_words = split_sentence(reference, 1)
    count = 0
    total = 0
    for word in reference_words:
        if word in candidate_words:
            count += 1
    for word in candidate_words:
        total += candidate_words[word]
    if total == 0:
        return "0 (warning: length of candidate's words is 0)"
    else:
        return count / total
#F1
def calculate_f1score(candidate, reference):
    # Token-level F1 over unigram counts (words occurring in both sentences count as true positives)
    candidate_words = split_sentence(candidate, 1)
    reference_words = split_sentence(reference, 1)
    word_set = set()
    for word in candidate_words:
        word_set.add(word)
    for word in reference_words:
        word_set.add(word)
    tp = 0
    fp = 0
    fn = 0
    for word in word_set:
        if word in candidate_words and word in reference_words:
            tp += candidate_words[word]
        elif word in candidate_words and word not in reference_words:
            fp += candidate_words[word]
        elif word not in candidate_words and word in reference_words:
            fn += reference_words[word]
    if len(candidate_words) == 0:
        return "0 (warning: length of candidate's words is 0)"
    elif len(reference_words) == 0:
        return 0
    elif tp == 0:
        return 0
    else:
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        return 2 * precision * recall / (precision + recall)
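# A minimal smoke test, assuming the illustrative sentences below (they are not part of the
# original evaluation data); it exercises the word-level BLEU, exact match, F1 and ROUGE helpers.
if __name__ == "__main__":
    candidate = "the cat sat on the mat"
    references = ["a cat is sitting on the mat", "there is a cat on the mat"]
    print("bleu:", bleu(candidate, references, 4, (0.25, 0.25, 0.25, 0.25)))
    print("exact match:", calculate_exactmatch(candidate, references[0]))
    print("f1:", calculate_f1score(candidate, references[0]))
    print("rouge:", calculate_rouge(candidate, references[0]))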