In [ ]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Evaluate Classification
Author(s): Renato Leite (renatoleite@), Egon Soares (egon@)
Last updated: 09/05/2023
Per Class
- Dataset used for this sample: CARER: Contextualized Affect Representations for Emotion Recognition, by Elvis Saravia, Hsien-Chi Toby Liu, Yen-Hao Huang, Junlin Wu, and Yi-Shin Chen. In Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, pages 3687-3697, Brussels, Belgium, October-November 2018. Association for Computational Linguistics.
In [1]:
# from https://github.com/dair-ai/emotion_dataset - modified to binary classification
texts = [
    'i left with my bouquet of red and yellow tulips under my arm feeling slightly more optimistic than when i arrived',
    'i explain why i clung to a relationship with a boy who was in many ways immature and uncommitted despite the excitement i should have been feeling for getting accepted into the masters program at the university of virginia',
    'i like to have the same breathless feeling as a reader eager to see what will happen next',
    'i jest i feel grumpy tired and pre menstrual which i probably am but then again its only been a week and im about as fit as a walrus on vacation for the summer',
    'i don t feel particularly agitated',
    'i feel beautifully emotional knowing that these women of whom i knew just a handful were holding me and my baba on our journey',
    'i pay attention it deepens into a feeling of being invaded and helpless',
    'i just feel extremely comfortable with the group of people that i dont even need to hide myself',
    'i find myself in the odd position of feeling supportive of',
    'i was feeling as heartbroken as im sure katniss was',
    'i feel a little mellow today',
    'i feel like my only role now would be to tear your sails with my pessimism and discontent',
    'i feel just bcoz a fight we get mad to each other n u wanna make a publicity n let the world knows about our fight',
    'i feel like reds and purples are just so rich and kind of perfect']

# Positive Sentiment = 1
# Negative Sentiment = 0
ground_truth = [1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1]

# Sample prediction
predicted = [1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1]
In [2]:
def count_tp_fp_fn(ground_truth_list: list, predicted_list: list, positive_class) -> tuple:
    """Count true positives, false positives, and false negatives for one class."""
    true_positives = 0
    false_positives = 0
    false_negatives = 0
    for i in range(len(ground_truth_list)):
        if ground_truth_list[i] == positive_class:
            if predicted_list[i] == positive_class:
                true_positives += 1
            else:
                false_negatives += 1
        elif predicted_list[i] == positive_class:
            false_positives += 1
    return true_positives, false_positives, false_negatives
In [3]:
# Sample results
positive_class = 1
true_positives, false_positives, false_negatives = count_tp_fp_fn(ground_truth, predicted, positive_class)
print(f"True Positives: {true_positives}")
print(f"False Positives: {false_positives}")
print(f"False Negatives: {false_negatives}")
True Positives: 5
False Positives: 3
False Negatives: 2
F1 Score
$precision = \frac{TP}{TP + FP}$
In [4]:
precision = true_positives / (true_positives + false_positives)
print(f"Precision: {precision:.3f}")
Precision: 0.625
$recall = \frac{TP}{TP+FN}$
In [5]:
recall = true_positives / (true_positives + false_negatives)
print(f"Recall: {recall:.3f}")
Recall: 0.714
In [6]:
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
Precision: 0.625
Recall: 0.714
First method: using precision and recall
$F_1 = \cfrac{2}{\cfrac{1}{precision}+\cfrac{1}{recall}}$
In [7]:
f1_score_a = 2 / ((1 / precision) + (1 / recall))
print(f"F1 Score calculated using precision and recall: {f1_score_a:.3f}")
F1 Score calculated using precision and recall: 0.667
Second method: using TP, FP, and FN
$F_1 = \cfrac{TP}{TP + \cfrac{FP+FN}{2}}$
In [8]:
f1_score_b = true_positives / (true_positives + (false_positives + false_negatives) / 2)
print(f"F1 Score calculated using TP FP and FN: {f1_score_b:.3f}")
F1 Score calculated using TP FP and FN: 0.667
In [9]:
import math
print(f"The two f1 scores are equal? {f1_score_a == f1_score_b}")
print(f"The two f1 scores are close up to 15 decimal places? {math.isclose(f1_score_a, f1_score_b, abs_tol=0.0000000000000001)}")
print(f1_score_a)
print(f1_score_b)
The two f1 scores are equal? True
The two f1 scores are close up to 15 decimal places? True
0.6666666666666666
0.6666666666666666
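The two formulas always agree: substituting the definitions of precision and recall into the harmonic mean and simplifying recovers the count-based form.

$F_1 = \cfrac{2}{\cfrac{TP+FP}{TP}+\cfrac{TP+FN}{TP}} = \cfrac{2 \cdot TP}{2 \cdot TP + FP + FN} = \cfrac{TP}{TP + \cfrac{FP+FN}{2}}$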
Multiclass
- Dataset used for this sample: CARER: Contextualized Affect Representations for Emotion Recognition, by Elvis Saravia, Hsien-Chi Toby Liu, Yen-Hao Huang, Junlin Wu, and Yi-Shin Chen. In Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, pages 3687-3697, Brussels, Belgium, October-November 2018. Association for Computational Linguistics.
In [10]:
# from https://github.com/dair-ai/emotion_dataset
multi_class_texts = [
    'im feeling rather rotten so im not very ambitious right now',
    'im updating my blog because i feel shitty',
    'i never make her separate from me because i don t ever want her to feel like i m ashamed with her',
    'i left with my bouquet of red and yellow tulips under my arm feeling slightly more optimistic than when i arrived',
    'i was feeling a little vain when i did this one',
    'i cant walk into a shop anywhere where i do not feel uncomfortable',
    'i felt anger when at the end of a telephone call',
    'i explain why i clung to a relationship with a boy who was in many ways immature and uncommitted despite the excitement i should have been feeling for getting accepted into the masters program at the university of virginia',
    'i like to have the same breathless feeling as a reader eager to see what will happen next',
    'i jest i feel grumpy tired and pre menstrual which i probably am but then again its only been a week and im about as fit as a walrus on vacation for the summer',
    'i don t feel particularly agitated',
    'i feel beautifully emotional knowing that these women of whom i knew just a handful were holding me and my baba on our journey',
    'i pay attention it deepens into a feeling of being invaded and helpless',
    'i just feel extremely comfortable with the group of people that i dont even need to hide myself',
    'i find myself in the odd position of feeling supportive of',
    'i was feeling as heartbroken as im sure katniss was',
    'i feel a little mellow today',
    'i feel like my only role now would be to tear your sails with my pessimism and discontent',
    'i feel just bcoz a fight we get mad to each other n u wanna make a publicity n let the world knows about our fight',
    'i feel like reds and purples are just so rich and kind of perfect']

# 0: 'sadness'
# 1: 'joy'
# 2: 'love'
# 3: 'anger'
# 4: 'fear'
# 5: 'surprise'
ground_truth_multi = [0, 0, 0, 1, 0, 4, 3, 1, 1, 3, 4, 0, 4, 1, 2, 0, 1, 0, 3, 1]
predicted_multi = [0, 1, 2, 1, 2, 4, 3, 3, 1, 4, 4, 0, 4, 1, 2, 0, 1, 0, 3, 1]
In [11]:
# Sample Results
n_class = 5
multiclass_results_list = [count_tp_fp_fn(ground_truth_multi, predicted_multi, i) for i in range(n_class)]
true_positives_list = [class_result[0] for class_result in multiclass_results_list]
false_positives_list = [class_result[1] for class_result in multiclass_results_list]
false_negatives_list = [class_result[2] for class_result in multiclass_results_list]
In [12]:
true_positives_list
Out[12]:
[4, 5, 1, 2, 3]
In [13]:
false_positives_list
Out[13]:
[0, 1, 2, 1, 1]
In [14]:
false_negatives_list
Out[14]:
[3, 1, 0, 1, 0]
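As an extra sanity check (not in the original notebook), the per-class true positives and false negatives should add up to the number of ground-truth examples of each class; a minimal sketch using the variables defined above:

In [ ]:
from collections import Counter

# Number of ground-truth examples per class; TP_i + FN_i must equal it for every class i.
class_counts = Counter(ground_truth_multi)
for i in range(n_class):
    assert true_positives_list[i] + false_negatives_list[i] == class_counts[i]
print("TP + FN matches the ground-truth count for every class")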
Macro F1
$\text{Macro } F_1 = \cfrac{\sum_{i=1}^{n} F_1\,\text{Score}_i}{n}$
Example for 2 classes
In [15]:
f1_score_0 = true_positives_list[0] / (true_positives_list[0] + (false_positives_list[0] + false_negatives_list[0]) / 2)
f1_score_1 = true_positives_list[1] / (true_positives_list[1] + (false_positives_list[1] + false_negatives_list[1]) / 2)
In [16]:
macro_f1_score = (f1_score_0 + f1_score_1) / 2
print(macro_f1_score)
0.7803030303030303
Example for all classes
In [17]:
f1_scores = [true_positives_list[i] / (true_positives_list[i] + (false_positives_list[i] + false_negatives_list[i]) / 2) for i in range(n_class)]
In [18]:
print(f1_scores)
[0.7272727272727273, 0.8333333333333334, 0.5, 0.6666666666666666, 0.8571428571428571]
In [19]:
macro_f1_score = sum(f1_scores) / len(f1_scores)
print(macro_f1_score)
0.7168831168831169
In [20]:
from statistics import mean
In [21]:
mean(f1_scores)
Out[21]:
0.7168831168831169
Micro F1
$\text{Micro } F_1 = \cfrac{\sum_{i=1}^{n} TP_i}{\sum_{i=1}^{n} TP_i + \cfrac{\sum_{i=1}^{n} FP_i + \sum_{i=1}^{n} FN_i}{2}}$
In [22]:
micro_f1_score = sum(true_positives_list) / (sum(true_positives_list) + ((sum(false_positives_list) + sum(false_negatives_list))/2))
In [23]:
print(micro_f1_score)
0.75
In [24]:
tp_sum = sum(true_positives_list)
fp_sum = sum(false_positives_list)
fn_sum = sum(false_negatives_list)
In [25]:
micro_f1_score = tp_sum / (tp_sum + (fp_sum + fn_sum) / 2)
In [26]:
print(micro_f1_score)
0.75
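In a single-label multiclass setting, every misclassified example adds exactly one false positive (for the predicted class) and one false negative (for the true class), so micro F1 reduces to plain accuracy. A quick check on this sample (not in the original notebook):

In [ ]:
# Accuracy: fraction of examples whose predicted class matches the ground truth.
correct = sum(1 for gt, pred in zip(ground_truth_multi, predicted_multi) if gt == pred)
accuracy = correct / len(ground_truth_multi)
print(accuracy)  # 0.75, the same value as micro_f1_score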
Scikit-learn
In [27]:
!pip install -U scikit-learn
Requirement already satisfied: scikit-learn in ./venv/lib/python3.9/site-packages (1.3.0)
Collecting scikit-learn
  Downloading scikit_learn-1.3.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.9 MB)
     |████████████████████████████████| 10.9 MB 4.2 MB/s eta 0:00:01
Requirement already satisfied: joblib>=1.1.1 in ./venv/lib/python3.9/site-packages (from scikit-learn) (1.3.2)
Requirement already satisfied: scipy>=1.5.0 in ./venv/lib/python3.9/site-packages (from scikit-learn) (1.11.2)
Requirement already satisfied: numpy<2.0,>=1.17.3 in ./venv/lib/python3.9/site-packages (from scikit-learn) (1.25.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in ./venv/lib/python3.9/site-packages (from scikit-learn) (3.2.0)
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.3.0
    Uninstalling scikit-learn-1.3.0:
      Successfully uninstalled scikit-learn-1.3.0
Successfully installed scikit-learn-1.3.1
In [28]:
from sklearn.metrics import f1_score
In [29]:
# Per class
f1_score(ground_truth_multi, predicted_multi, average=None)
Out[29]:
array([0.72727273, 0.83333333, 0.5 , 0.66666667, 0.85714286])
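These per-class values match the f1_scores list computed by hand earlier; a quick cross-check (not in the original notebook), allowing for floating-point noise:

In [ ]:
import numpy as np

# Compare the manual per-class scores with sklearn's.
print(np.allclose(f1_scores, f1_score(ground_truth_multi, predicted_multi, average=None)))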
In [30]:
# Macro
f1_score(ground_truth_multi, predicted_multi, average='macro')
Out[30]:
0.7168831168831169
In [31]:
# Micro
f1_score(ground_truth_multi, predicted_multi, average='micro')
Out[31]:
0.75
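As an optional follow-up (not part of the original notebook), scikit-learn's classification_report prints per-class precision, recall, and F1 together with the macro and weighted averages in one table; a minimal sketch assuming the variables defined above:

In [ ]:
from sklearn.metrics import classification_report

# Only labels 0-4 occur in this sample, so class 5 ('surprise') does not appear in the report.
label_names = ['sadness', 'joy', 'love', 'anger', 'fear']
print(classification_report(ground_truth_multi, predicted_multi, target_names=label_names))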