In [ ]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Evaluate Classification
Author(s): Renato Leite (renatoleite@), Egon Soares (egon@)
Last updated: 09/05/2023
Per Class
- Dataset used for this sample: CARER: Contextualized Affect Representations for Emotion Recognition, by Elvis Saravia, Hsien-Chi Toby Liu, Yen-Hao Huang, Junlin Wu, and Yi-Shin Chen. In Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, pages 3687-3697, Brussels, Belgium, October-November 2018. Association for Computational Linguistics.
In [1]:
# from https://github.com/dair-ai/emotion_dataset - modified to binary classification
texts = [
    'i left with my bouquet of red and yellow tulips under my arm feeling slightly more optimistic than when i arrived',
    'i explain why i clung to a relationship with a boy who was in many ways immature and uncommitted despite the excitement i should have been feeling for getting accepted into the masters program at the university of virginia',
    'i like to have the same breathless feeling as a reader eager to see what will happen next',
    'i jest i feel grumpy tired and pre menstrual which i probably am but then again its only been a week and im about as fit as a walrus on vacation for the summer',
    'i don t feel particularly agitated',
    'i feel beautifully emotional knowing that these women of whom i knew just a handful were holding me and my baba on our journey',
    'i pay attention it deepens into a feeling of being invaded and helpless',
    'i just feel extremely comfortable with the group of people that i dont even need to hide myself',
    'i find myself in the odd position of feeling supportive of',
    'i was feeling as heartbroken as im sure katniss was',
    'i feel a little mellow today',
    'i feel like my only role now would be to tear your sails with my pessimism and discontent',
    'i feel just bcoz a fight we get mad to each other n u wanna make a publicity n let the world knows about our fight',
    'i feel like reds and purples are just so rich and kind of perfect']

# Positive Sentiment = 1
# Negative Sentiment = 0
ground_truth = [1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1]

# Sample prediction
predicted = [1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1]
In [2]:
def count_tp_fp_fn(ground_truth_list: list, predicted_list: list, positive_class) -> tuple:
    """Count true positives, false positives, and false negatives for one class."""
    true_positives = 0
    false_positives = 0
    false_negatives = 0
    for i in range(len(ground_truth_list)):
        if ground_truth_list[i] == positive_class:
            if predicted_list[i] == positive_class:
                true_positives += 1
            else:
                false_negatives += 1
        elif predicted_list[i] == positive_class:
            false_positives += 1
    return true_positives, false_positives, false_negatives
In [3]:
# Sample results
positive_class = 1
true_positives, false_positives, false_negatives = count_tp_fp_fn(ground_truth, predicted, positive_class)
print(f"True Positives: {true_positives}")
print(f"False Positives: {false_positives}")
print(f"False Negatives: {false_negatives}")
True Positives: 5
False Positives: 3
False Negatives: 2
F1 Score
$precision = \frac{TP}{TP + FP}$
In [4]:
precision = true_positives / (true_positives + false_positives)
print(f"Precision: {precision:.3f}")
Precision: 0.625
$recall = \frac{TP}{TP+FN}$
In [5]:
recall = true_positives / (true_positives + false_negatives)
print(f"Recall: {recall:.3f}")
Recall: 0.714
In [6]:
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
Precision: 0.625
Recall: 0.714
First method: using precision and recall
$F_1 = \cfrac{2}{\cfrac{1}{precision}+\cfrac{1}{recall}}$
In [7]:
f1_score_a = 2 / ((1 / precision) + (1 / recall))
print(f"F1 Score calculated using precision and recall: {f1_score_a:.3f}")
F1 Score calculated using precision and recall: 0.667
Second method: using TP, FP, and FN
$F_1 = \cfrac{TP}{TP + \cfrac{FP+FN}{2}}$
In [8]:
f1_score_b = true_positives / (true_positives + (false_positives + false_negatives) / 2)
print(f"F1 Score calculated using TP FP and FN: {f1_score_b:.3f}")
F1 Score calculated using TP FP and FN: 0.667
In [9]:
import math
print(f"The two f1 scores are equal? {f1_score_a == f1_score_b}")
print(f"The two f1 scores are close up to 15 decimal places? {math.isclose(f1_score_a, f1_score_b, abs_tol=0.0000000000000001)}")
print(f1_score_a)
print(f1_score_b)
The two f1 scores are equal? True
The two f1 scores are close up to 15 decimal places? True
0.6666666666666666
0.6666666666666666
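The two formulas always agree: substituting the definitions of precision and recall into the harmonic mean and simplifying recovers the count-based form.

$F_1 = \cfrac{2}{\cfrac{TP+FP}{TP}+\cfrac{TP+FN}{TP}} = \cfrac{2 \cdot TP}{2 \cdot TP + FP + FN} = \cfrac{TP}{TP + \cfrac{FP+FN}{2}}$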
Multiclass
- Dataset used for this sample: CARER: Contextualized Affect Representations for Emotion Recognition, by Elvis Saravia, Hsien-Chi Toby Liu, Yen-Hao Huang, Junlin Wu, and Yi-Shin Chen. In Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, pages 3687-3697, Brussels, Belgium, October-November 2018. Association for Computational Linguistics.
In [10]:
# from https://github.com/dair-ai/emotion_dataset
multi_class_texts = [
    'im feeling rather rotten so im not very ambitious right now',
    'im updating my blog because i feel shitty',
    'i never make her separate from me because i don t ever want her to feel like i m ashamed with her',
    'i left with my bouquet of red and yellow tulips under my arm feeling slightly more optimistic than when i arrived',
    'i was feeling a little vain when i did this one',
    'i cant walk into a shop anywhere where i do not feel uncomfortable',
    'i felt anger when at the end of a telephone call',
    'i explain why i clung to a relationship with a boy who was in many ways immature and uncommitted despite the excitement i should have been feeling for getting accepted into the masters program at the university of virginia',
    'i like to have the same breathless feeling as a reader eager to see what will happen next',
    'i jest i feel grumpy tired and pre menstrual which i probably am but then again its only been a week and im about as fit as a walrus on vacation for the summer',
    'i don t feel particularly agitated',
    'i feel beautifully emotional knowing that these women of whom i knew just a handful were holding me and my baba on our journey',
    'i pay attention it deepens into a feeling of being invaded and helpless',
    'i just feel extremely comfortable with the group of people that i dont even need to hide myself',
    'i find myself in the odd position of feeling supportive of',
    'i was feeling as heartbroken as im sure katniss was',
    'i feel a little mellow today',
    'i feel like my only role now would be to tear your sails with my pessimism and discontent',
    'i feel just bcoz a fight we get mad to each other n u wanna make a publicity n let the world knows about our fight',
    'i feel like reds and purples are just so rich and kind of perfect']

# 0: 'sadness'
# 1: 'joy'
# 2: 'love'
# 3: 'anger'
# 4: 'fear'
# 5: 'surprise'
ground_truth_multi = [0, 0, 0, 1, 0, 4, 3, 1, 1, 3, 4, 0, 4, 1, 2, 0, 1, 0, 3, 1]
predicted_multi = [0, 1, 2, 1, 2, 4, 3, 3, 1, 4, 4, 0, 4, 1, 2, 0, 1, 0, 3, 1]
In [11]:
# Sample Results
n_class = 5
multiclass_results_list = [count_tp_fp_fn(ground_truth_multi, predicted_multi, i) for i in range(n_class)]
true_positives_list = [class_result[0] for class_result in multiclass_results_list]
false_positives_list = [class_result[1] for class_result in multiclass_results_list]
false_negatives_list = [class_result[2] for class_result in multiclass_results_list]
In [12]:
true_positives_list
Out[12]:
[4, 5, 1, 2, 3]
In [13]:
false_positives_list
Out[13]:
[0, 1, 2, 1, 1]
In [14]:
false_negatives_list
Out[14]:
[3, 1, 0, 1, 0]
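As an extra sanity check (not in the original notebook), the per-class true positives and false negatives should add up to the number of ground-truth examples of each class; a minimal sketch using the variables defined above:

In [ ]:
from collections import Counter

# Number of ground-truth examples per class; TP_i + FN_i must equal it for every class i.
class_counts = Counter(ground_truth_multi)
for i in range(n_class):
    assert true_positives_list[i] + false_negatives_list[i] == class_counts[i]
print("TP + FN matches the ground-truth count for every class")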
Macro F1
$\text{Macro } F_1 = \cfrac{\sum_{i=1}^{n} F_1\,\text{Score}_i}{n}$
Example for 2 classes
In [15]:
f1_score_0 = true_positives_list[0] / (true_positives_list[0] + (false_positives_list[0] + false_negatives_list[0]) / 2)
f1_score_1 = true_positives_list[1] / (true_positives_list[1] + (false_positives_list[1] + false_negatives_list[1]) / 2)
In [16]:
macro_f1_score = (f1_score_0 + f1_score_1) / 2
print(macro_f1_score)
0.7803030303030303
Example for all classes
In [17]:
f1_scores = [true_positives_list[i] / (true_positives_list[i] + (false_positives_list[i] + false_negatives_list[i]) / 2) for i in range(n_class)]
In [18]:
print(f1_scores)
[0.7272727272727273, 0.8333333333333334, 0.5, 0.6666666666666666, 0.8571428571428571]
In [19]:
macro_f1_score = sum(f1_scores) / len(f1_scores)
print(macro_f1_score)
0.7168831168831169
In [20]:
from statistics import mean
In [21]:
mean(f1_scores)
Out[21]:
0.7168831168831169
Micro F1
$\text{Micro } F_1 = \cfrac{\sum_{i=1}^{n} TP_i}{\sum_{i=1}^{n} TP_i + \cfrac{\sum_{i=1}^{n} FP_i + \sum_{i=1}^{n} FN_i}{2}}$
In [22]:
micro_f1_score = sum(true_positives_list) / (sum(true_positives_list) + ((sum(false_positives_list) + sum(false_negatives_list))/2))
In [23]:
print(micro_f1_score)
0.75
In [24]:
tp_sum = sum(true_positives_list)
fp_sum = sum(false_positives_list)
fn_sum = sum(false_negatives_list)
In [25]:
micro_f1_score = tp_sum / (tp_sum + (fp_sum + fn_sum) / 2)
In [26]:
print(micro_f1_score)
0.75
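In a single-label multiclass setting, every misclassified example adds exactly one false positive (for the predicted class) and one false negative (for the true class), so micro F1 reduces to plain accuracy. A quick check on this sample (not in the original notebook):

In [ ]:
# Accuracy: fraction of examples whose predicted class matches the ground truth.
correct = sum(1 for gt, pred in zip(ground_truth_multi, predicted_multi) if gt == pred)
accuracy = correct / len(ground_truth_multi)
print(accuracy)  # 0.75, the same value as micro_f1_score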
Scikit-learn
In [27]:
!pip install -U scikit-learn
Requirement already satisfied: scikit-learn in ./venv/lib/python3.9/site-packages (1.3.0)
Collecting scikit-learn
  Downloading scikit_learn-1.3.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.9 MB)
     |████████████████████████████████| 10.9 MB 4.2 MB/s eta 0:00:01
Requirement already satisfied: joblib>=1.1.1 in ./venv/lib/python3.9/site-packages (from scikit-learn) (1.3.2)
Requirement already satisfied: scipy>=1.5.0 in ./venv/lib/python3.9/site-packages (from scikit-learn) (1.11.2)
Requirement already satisfied: numpy<2.0,>=1.17.3 in ./venv/lib/python3.9/site-packages (from scikit-learn) (1.25.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in ./venv/lib/python3.9/site-packages (from scikit-learn) (3.2.0)
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.3.0
    Uninstalling scikit-learn-1.3.0:
      Successfully uninstalled scikit-learn-1.3.0
Successfully installed scikit-learn-1.3.1
In [28]:
from sklearn.metrics import f1_score
In [29]:
# Per class
f1_score(ground_truth_multi, predicted_multi, average=None)
Out[29]:
array([0.72727273, 0.83333333, 0.5 , 0.66666667, 0.85714286])
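These per-class values match the f1_scores list computed by hand earlier; a quick cross-check (not in the original notebook), allowing for floating-point noise:

In [ ]:
import numpy as np

# Compare the manual per-class scores with sklearn's.
print(np.allclose(f1_scores, f1_score(ground_truth_multi, predicted_multi, average=None)))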
In [30]:
# Macro
f1_score(ground_truth_multi, predicted_multi, average='macro')
Out[30]:
0.7168831168831169
In [31]:
# Micro
f1_score(ground_truth_multi, predicted_multi, average='micro')
Out[31]:
0.75
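As an optional follow-up (not part of the original notebook), scikit-learn's classification_report prints per-class precision, recall, and F1 together with the macro and weighted averages in one table; a minimal sketch assuming the variables defined above:

In [ ]:
from sklearn.metrics import classification_report

# Only labels 0-4 occur in this sample, so class 5 ('surprise') does not appear in the report.
label_names = ['sadness', 'joy', 'love', 'anger', 'fear']
print(classification_report(ground_truth_multi, predicted_multi, target_names=label_names))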