English
Llama-slideQA / utils /.ipynb_checkpoints /eval_utils-checkpoint.py
weiheng-1009's picture
added code for running
cbff41a
from collections import defaultdict
import re
import math
from rouge import Rouge
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
def compute_bleu_scores(candidate_list, reference_list, avg=False):
    """Compute sentence-level BLEU for each candidate against its references.

    Args:
        candidate_list: list of candidate sentences (strings).
        reference_list: parallel list; each entry is a list of reference
            sentences (strings) for the corresponding candidate.
        avg: when True, return the mean BLEU over all pairs instead of the
            per-pair list.

    Returns:
        A list of per-candidate BLEU floats, or their mean when ``avg=True``.
    """
    bleu_scores = []
    # method1 smoothing avoids zero scores when higher-order n-grams are absent.
    smoothie = SmoothingFunction().method1
    for candidate, references in zip(candidate_list, reference_list):
        candidate_tokens = candidate.split()
        reference_tokens = [reference.split() for reference in references]
        # Shortest reference bounds the n-gram order we can meaningfully use.
        min_length = min((len(tokens) for tokens in reference_tokens), default=0)
        if min_length == 0:
            # No usable reference tokens (empty references, or an empty
            # reference string). The original code either crashed (min() of
            # an empty sequence) or called sentence_bleu with all-zero
            # weights, which degenerates to BLEU = 1; score 0 instead.
            bleu_scores.append(0.0)
            continue
        if min_length > 4:
            # References are long enough for the standard 4-gram BLEU weights.
            weights = (0.25, 0.25, 0.25, 0.25)
        else:
            # Spread weight uniformly over the n-gram orders the shortest
            # reference can actually support; zero out the rest.
            weights = tuple(1 / min_length for _ in range(min_length)) + tuple(
                0 for _ in range(4 - min_length)
            )
        bleu = sentence_bleu(reference_tokens, candidate_tokens,
                             weights=weights, smoothing_function=smoothie)
        bleu_scores.append(bleu)
    return np.mean(bleu_scores) if avg else bleu_scores
def calculate_rouge(candidate, reference):
    # Compute ROUGE scores for one candidate via the third-party `rouge`
    # package; returns whatever Rouge.get_scores yields (a list with one
    # dict of rouge-1/rouge-2/rouge-l f/p/r values per pair).
    rouge = Rouge()
    '''
    candidate, reference: generated and ground-truth sentences
    '''
    # NOTE(review): `candidate` is wrapped in a list but `reference` is passed
    # through as-is. Rouge.get_scores expects hyps and refs of matching
    # type/length, so callers presumably pass `reference` as a list of strings
    # already — confirm against call sites before changing this.
    scores = rouge.get_scores([candidate], reference)
    return scores
def brevity_penalty(candidate, references):
    """Return the BLEU brevity penalty for one candidate.

    Args:
        candidate: candidate sequence (anything supporting ``len``).
        references: iterable of reference sequences.

    Returns:
        1 when the candidate is longer than the effective reference length,
        exp(1 - r/c) otherwise, and 0 for an empty candidate (the original
        raised ZeroDivisionError on ``r / c`` when c == 0; BLEU's penalty
        tends to 0 as c -> 0).
    """
    c = len(candidate)
    if c == 0:
        # Empty candidate: maximum penalty, and avoids dividing by zero below.
        return 0
    ref_lens = (len(reference) for reference in references)
    # Effective reference length: the one closest to the candidate's length,
    # preferring the shorter reference on ties.
    r = min(ref_lens, key=lambda ref_len: (abs(ref_len - c), ref_len))
    if c > r:
        return 1
    return math.exp(1 - r / c)
def modified_precision(candidate, references, n):
    """Clipped (modified) n-gram precision of `candidate` against `references`.

    Each candidate n-gram count is clipped at the maximum number of times the
    n-gram occurs in any single reference — the standard BLEU modified
    precision (Papineni et al., 2002).

    Args:
        candidate: candidate sentence (string; tokenized by split_sentence).
        references: iterable of reference sentences (strings).
        n: n-gram order.

    Returns:
        The clipped precision in [0, 1]; 0 when the candidate yields no
        n-grams (the original raised ZeroDivisionError in that case).
    """
    max_frequency = defaultdict(int)
    min_frequency = defaultdict(int)
    candidate_words = split_sentence(candidate, n)
    total = sum(candidate_words.values())
    if total == 0:
        # Candidate shorter than n tokens -> no n-grams to score.
        return 0
    for reference in references:
        reference_words = split_sentence(reference, n)
        for word in candidate_words:
            # Highest count of this n-gram across the individual references.
            max_frequency[word] = max(max_frequency[word], reference_words[word])
    for word in candidate_words:
        # Clip the candidate's count by the reference maximum.
        min_frequency[word] = min(max_frequency[word], candidate_words[word])
    return sum(min_frequency.values()) / total
def split_sentence(sentence, n):
    """Tokenize `sentence` and count its n-grams.

    The sentence is stripped of everything except ASCII letters and spaces,
    lowercased, and whitespace-split; each run of `n` consecutive tokens is
    joined with a single space and counted.

    Returns:
        defaultdict(int) mapping each n-gram string to its occurrence count
        (empty when the sentence has fewer than `n` tokens).
    """
    ngram_counts = defaultdict(int)
    tokens = re.sub("[^a-zA-Z ]", "", sentence).lower().strip().split()
    for start in range(len(tokens) - n + 1):
        gram = " ".join(tokens[start:start + n])
        if gram:
            ngram_counts[gram] += 1
    return ngram_counts
def bleu(candidate, references, n, weights):
    """Hand-rolled BLEU up to n-grams, returned as a string.

    Computes the brevity penalty and the modified precisions p_1..p_n, then
    combines them via calculate_bleu. If `weights` does not have exactly `n`
    entries it is truncated (too long) or zero-padded (too short), and a
    warning suffix is appended to the returned string.
    """
    bp = brevity_penalty(candidate, references)
    pn = [modified_precision(candidate, references, order + 1) for order in range(n)]
    if len(weights) > len(pn):
        # Too many weights: keep only the first len(pn) of them.
        trimmed = [weights[idx] for idx in range(len(pn))]
        result = calculate_bleu(trimmed, pn, n, bp)
        return str(result) + " (warning: the length of weights is bigger than n)"
    if len(weights) < len(pn):
        # Too few weights: pad with zeros up to len(pn).
        padded = list(weights) + [0] * (len(pn) - len(weights))
        result = calculate_bleu(padded, pn, n, bp)
        return str(result) + " (warning: the length of weights is smaller than n)"
    return str(calculate_bleu(weights, pn, n, bp))
#BLEU
def calculate_bleu(weights, pn, n, bp):
    """Combine modified precisions into a BLEU score.

    Returns bp * exp(sum_i w_i * log p_i) over the first `n` precisions,
    silently skipping any p_i == 0 (so they contribute a factor of 1 rather
    than driving the score to 0).
    """
    weighted_log_sum = sum(
        float(weights[i]) * math.log(pn[i]) for i in range(n) if pn[i] != 0
    )
    return bp * math.exp(weighted_log_sum)
#Exact match
def calculate_exactmatch(candidate, reference):
    """Word-overlap "exact match" score.

    Counts how many distinct reference words appear in the candidate and
    divides by the candidate's total word count (with multiplicity).

    Returns:
        A float ratio, or a warning string when the candidate tokenizes to
        nothing (to avoid dividing by zero).
    """
    candidate_counts = split_sentence(candidate, 1)
    reference_counts = split_sentence(reference, 1)
    # Total candidate tokens, counted with multiplicity.
    total = sum(candidate_counts.values())
    if total == 0:
        return "0 (warning: length of candidate's words is 0)"
    # One point per distinct reference word found anywhere in the candidate.
    matched = sum(1 for word in reference_counts if word in candidate_counts)
    return matched / total
#F1
def calculate_f1score(candidate, reference):
    """Unigram-overlap F1 between candidate and reference.

    True/false positives and false negatives are counted with word
    multiplicities from split_sentence's unigram counts.

    Returns:
        The F1 float; 0 when there is no overlap or the reference is empty;
        a warning string when the candidate tokenizes to nothing.
    """
    candidate_counts = split_sentence(candidate, 1)
    reference_counts = split_sentence(reference, 1)
    # Guard clauses for degenerate inputs (same returns as before).
    if not candidate_counts:
        return "0 (warning: length of candidate's words is 0)"
    if not reference_counts:
        return 0
    # tp: candidate tokens whose word also occurs in the reference;
    # fp: candidate tokens absent from the reference;
    # fn: reference tokens absent from the candidate.
    tp = sum(cnt for word, cnt in candidate_counts.items() if word in reference_counts)
    fp = sum(cnt for word, cnt in candidate_counts.items() if word not in reference_counts)
    fn = sum(cnt for word, cnt in reference_counts.items() if word not in candidate_counts)
    if tp == 0:
        # No overlap at all -> F1 is 0 (also dodges a 0/0 harmonic mean).
        return 0
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    return 2 * precision * recall / (precision + recall)