mindnlp.common.metrics 源代码

# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
""""Functions for Metrics"""

import sys
import math
import string
from collections.abc import Iterable
from collections import Counter
import re
import numpy as np
from mindspore import Tensor


[文档]def perplexity(preds, labels, ignore_label=None): r""" Calculates the perplexity. Perplexity is a measure of how well a probabilibity model predicts a sample. A low perplexity indicates the model is good at predicting the sample. The function is shown as follows: .. math:: PP(W)=P(w_{1}w_{2}...w_{N})^{-\frac{1}{N}}=\sqrt[N]{\frac{1}{P(w_{1}w_{2}...w_{N})}} where :math:`w` represents words in corpus. Args: preds (Union[Tensor, list, np.ndarray]): Predicted value. `preds` is a list of floating numbers in range :math:`[0, 1]` and the shape of `preds` is :math:`(N, C)` in most cases (not strictly), where :math:`N` is the number of cases and :math:`C` is the number of categories. labels (Union[Tensor, list, np.ndarray]): Ground truth. `labels` must be in one-hot format that shape is :math:`(N, C)`, or can be transformed to one-hot format that shape is :math:`(N,)`. ignore_label (Union[int, None]): Index of an invalid label to be ignored when counting. If set to `None`, it means there's no invalid label. Default: None. Returns: - **ppl** (float) - The computed result. Raises: RuntimeError: If `preds` and `labels` have different lengths. RuntimeError: If `pred` and `label` have different shapes. RuntimeError: If the sample size is 0. Examples: >>> import numpy as np >>> import mindspore >>> from mindspore import Tensor >>> from mindnlp.common.metrics import perplexity >>> preds = Tensor(np.array([[0.2, 0.5], [0.3, 0.1], [0.9, 0.6]]), mindspore.float32) >>> labels = Tensor(np.array([1, 0, 1]), mindspore.int32) >>> ppl = perplexity(preds, labels, ignore_label=None) >>> print(ppl) 2.231443166940565 """ if ignore_label is not None: ignore_label = _check_value_type("ignore_label", ignore_label, [int]) preds = _check_value_type("preds", preds, [Tensor, list, np.ndarray]) labels = _check_value_type("labels", labels, [Tensor, list, np.ndarray]) y_pred = [_convert_data_type(preds)] y_true = [_convert_data_type(labels)] if len(y_pred) != len(y_true): raise RuntimeError(f'`preds` and `labels` should have the same length, but got `preds` ' f'length {len(y_pred)}, `labels` length {len(y_true)})') sum_cross_entropy = 0.0 sum_word_num = 0 cross_entropy = 0. word_num = 0 for label, pred in zip(y_true, y_pred): if pred.ndim == label.ndim and _check_onehot_data(label): label = label.argmax(axis=1) if label.size != pred.size / pred.shape[-1]: raise RuntimeError(f'`preds` and `labels` should have the same shape, but got `preds` ' f'shape {pred.shape}, label shape {label.shape}.') label = label.reshape((label.size,)) label_expand = label.astype(int) label_expand = np.expand_dims(label_expand, axis=1) first_indices = np.arange(label_expand.shape[0])[:, None] pred = np.squeeze(pred[first_indices, label_expand]) if ignore_label is not None: ignore = (label == ignore_label).astype(pred.dtype) word_num -= np.sum(ignore) pred = pred * (1 - ignore) + ignore cross_entropy -= np.sum(np.log(np.maximum(1e-10, pred))) word_num += pred.size sum_cross_entropy += cross_entropy sum_word_num += word_num if sum_word_num == 0: raise RuntimeError(f'Perplexity can not be calculated, because the number of samples is ' f'{0}') ppl = math.exp(sum_cross_entropy / sum_word_num) return ppl
[文档]def bleu(cand, ref_list, n_size=4, weights=None): r""" Calculates the BLEU score. BLEU (bilingual evaluation understudy) is a metric for evaluating the quality of text translated by machine. It uses a modified form of precision to compare a candidate translation against multiple reference translations. The function is shown as follows: .. math:: BP & = \begin{cases} 1, & \text{if }c>r \\ e_{1-r/c}, & \text{if }c\leq r \end{cases} BLEU & = BP\exp(\sum_{n=1}^N w_{n} \log{p_{n}}) where `c` is the length of candidate sentence, and `r` is the length of reference sentence. Args: cand (list): A list of tokenized candidate sentences. ref_list (list): A list of lists of tokenized true sentences. n_size (int): N_gram value ranges from 1 to 4. Default: 4. weights (Union[list, None]): Weights of precision of each gram. Defaults to None. Returns: - **bleu_score** (float) - The computed result. Raises: ValueError: If the value range of `n_size` is not from 1 to 4. ValueError: If the lengths of `cand` and `ref_list` are not equal. ValueError: If the lengths of `weights` is not equal to `n_size`. Example: >>> from mindnlp.common.metrics import bleu >>> cand = [["The", "cat", "The", "cat", "on", "the", "mat"]] >>> ref_list = [[["The", "cat", "is", "on", "the", "mat"], ["There", "is", "a", "cat", "on", "the", "mat"]]] >>> bleu_score = bleu(cand, ref_list) >>> print(bleu_score) 0.46713797772820015 """ n_size = _check_value_type("n_size", n_size, [int]) if n_size > 4 or n_size < 1: raise ValueError(f'`n_size` should range from 1 to 4, but got {n_size}') cand = _check_value_type("cand", cand, list) ref_list = _check_value_type("ref_list", ref_list, list) if len(cand) != len(ref_list): raise ValueError(f'`cand` and `ref_list` should be equal in length, but got {len(cand)}' f', {len(ref_list)}') numerator = np.zeros(n_size) denominator = np.zeros(n_size) precision_scores = np.zeros(n_size) bp_c = 0.0 bp_r = 0.0 cand_len = 0 ref_len = 0 for (candidate, references) in zip(cand, ref_list): bp_c += len(candidate) ref_len_list = [len(ref) for ref in references] ref_len_diff = [abs(len(candidate) - x) for x in ref_len_list] bp_r += ref_len_list[ref_len_diff.index(min(ref_len_diff))] candidate_counter = _count_ngram(candidate, n_size) reference_counter = Counter() for ref in references: reference_counter |= _count_ngram(ref, n_size) ngram_counter_clip = candidate_counter & reference_counter for counter_clip in ngram_counter_clip: numerator[len(counter_clip) - 1] += ngram_counter_clip[counter_clip] for counter in candidate_counter: denominator[len(counter) - 1] += candidate_counter[counter] cand_len = np.array(bp_c) ref_len = np.array(bp_r) if min(numerator) == 0.0: return np.array(0.0) precision_scores = numerator / denominator if weights is None: weights = [1 / n_size for _ in range(n_size)] if n_size != len(weights): raise ValueError("The length of `weights` should be equal to `n_size`") log_precision_scores = weights * np.log(precision_scores) geometric_mean = np.exp(np.sum(log_precision_scores)) brevity_penalty = np.array(1.0) if bp_c > bp_r else np.exp(1 - (ref_len / cand_len)) bleu_score = brevity_penalty * geometric_mean return bleu_score
[文档]def rouge_n(cand_list, ref_list, n_size=1): r""" Calculates the ROUGE-N score. ROUGE (Recall-Oriented Understudy for Gisting Evaluation) is a set of metrics used for evaluating automatic summarization and machine translation models. ROUGE-N refers to the overlap of n-grams between candidates and reference summaries. Args: cand_list (list): A list of tokenized candidate sentences. ref_list (list): A list of lists of tokenized true sentences. n_size (int): N_gram value. Default: 1. Returns: - **rougen_score** (float) - The computed result. Raises: RuntimeError: If the reference size is 0. Example: >>> from mindnlp.common.metrics import rouge_n >>> cand_list = ["the", "cat", "was", "found", "under", "the", "bed"] >>> ref_list = [["the", "cat", "was", "under", "the", "bed"]] >>> rougen_score = rouge_n(cand_list, ref_list, 2) >>> print(rougen_score) 0.8 """ cand_list = _check_value_type("cand_list", cand_list, list) ref_list = _check_value_type("ref_list", ref_list, list) n_size = _check_value_type("n_size", n_size, [int]) overlap_count = 0 ref_count = 0 cand_ngrams = _get_ngrams(cand_list, n_size) for reference in ref_list: ref_ngrams = _get_ngrams(reference, n_size) ref_count += len(ref_ngrams) # Gets the overlapping ngrams between evaluated and reference overlap_ngrams = cand_ngrams.intersection(ref_ngrams) overlap_count += len(overlap_ngrams) if ref_count == 0: RuntimeError(f'ROUGE-N can not be calculated, because the number of references is {0}') rougen_score = overlap_count / ref_count return rougen_score
[文档]def rouge_l(cand_list, ref_list, beta=1.2): r""" Calculates the ROUGE-L score. ROUGE (Recall-Oriented Understudy for Gisting Evaluation) is a set of metrics used for evaluating automatic summarization and machine translation models. ROUGE-L is calculated based on Longest Common Subsequence (LCS). The function is shown as follows: .. math:: R_{l c s}=\frac{L C S(X, Y)}{m} p_{l c s}=\frac{L C S(X, Y)}{n} F_{l c s}=\frac{\left(1+\beta^{2}\right) R_{l c s} P_{l c s}}{R_{l c s}+\beta^{2} P_{l c s}} where `X` is the candidate sentence, `Y` is the reference sentence. `m` and `n` represent the length of `X` and `Y` respectively. `LCS` means the longest common subsequence. Args: cand_list (list): A list of tokenized candidate sentence. ref_list (list): A list of lists of tokenized true sentences. beta (float): A hyperparameter to decide the weight of recall. Defaults: 1.2. Returns: - **rougel_score** (float) - The computed result. Example: >>> from mindnlp.common.metrics import rouge_l >>> cand_list = ["The","cat","The","cat","on","the","mat"] >>> ref_list = [["The","cat","is","on","the","mat"], ["There","is","a","cat","on","the","mat"]] >>> rougel_score = rouge_l(cand_list, ref_list) >>> print(rougel_score) 0.7800511508951408 """ cand_list = _check_value_type("cand_list", cand_list, list) ref_list = _check_value_type("ref_list", ref_list, list) beta = _check_value_type("beta", beta, [float]) inst_scores = [] precs, recalls = [], [] for ref in ref_list: basic_lcs = _lcs(cand_list, ref) prec = basic_lcs / len(cand_list) if cand_list is not None else 0. rec = basic_lcs / len(ref) if ref is not None else 0. precs.append(prec) recalls.append(rec) prec_max = max(precs) rec_max = max(recalls) if prec_max != 0 and rec_max != 0: score = ((1 + beta**2) * prec_max * rec_max) / \ float(rec_max + beta**2 * prec_max) else: score = 0.0 inst_scores.append(score) rougel_score = 1. * sum(inst_scores) / len(inst_scores) return rougel_score
[文档]def distinct(cand_list, n_size=2): """ Calculates the Distinct-N. Distinct-N is a metric that measures the diversity of a sentence. It focuses on the number of distinct n-gram of a sentence. The larger the number of distinct n-grams, the higher the diversity of the text. The function is shown as follows: Args: cand_list (list): A list of tokenized candidate sentence. n_size (int): N_gram value. Defaults: 2. Returns: - **distinct_score** (float) - The computed result. Example: >>> from mindnlp.common.metrics import distinct >>> cand_list = ["The", "cat", "The", "cat", "on", "the", "mat"] >>> distinct_score = distinct(cand_list) >>> print(distinct_score) 0.8333333333333334 """ cand_list = _check_value_type("cand_list", cand_list, list) n_size = _check_value_type("n_size", n_size, [int]) diff_ngram = set() count = 0.0 for i in range(0, len(cand_list) - n_size + 1): ngram = ' '.join(cand_list[i:(i + n_size)]) count += 1 diff_ngram.add(ngram) distinct_score = len(diff_ngram) / count return distinct_score
[文档]def accuracy(preds, labels): r""" Calculates the accuracy. The function is shown as follows: .. math:: \text{ACC} =\frac{\text{TP} + \text{TN}} {\text{TP} + \text{TN} + \text{FP} + \text{FN}} where `ACC` is accuracy, `TP` is the number of true posistive cases, `TN` is the number of true negative cases, `FP` is the number of false posistive cases, `FN` is the number of false negative cases. Args: preds (Union[Tensor, list, np.ndarray]): Predicted value. `preds` is a list of floating numbers in range :math:`[0, 1]` and the shape of `preds` is :math:`(N, C)` in most cases (not strictly), where :math:`N` is the number of cases and :math:`C` is the number of categories. labels (Union[Tensor, list, np.ndarray]): Ground truth. `labels` must be in one-hot format that shape is :math:`(N, C)`, or can be transformed to one-hot format that shape is :math:`(N,)`. Returns: - **acc** (float) - The computed result. Raises: RuntimeError: If the number of samples is 0. Example: >>> import numpy as np >>> import mindspore >>> from mindspore import Tensor >>> from mindnlp.common.metrics import accuracy >>> preds = Tensor(np.array([[0.2, 0.5], [0.3, 0.1], [0.9, 0.6]]), mindspore.float32) >>> labels = Tensor(np.array([1, 0, 1]), mindspore.int32) >>> acc = accuracy(preds, labels) >>> print(acc) 0.6666666666666666 """ correct_num = 0 total_num = 0 y_pred = _convert_data_type(preds) y_true = _convert_data_type(labels) if y_pred.ndim == y_true.ndim and _check_onehot_data(y_true): y_true = y_true.argmax(axis=1) _check_shape(y_pred, y_true) indices = y_pred.argmax(axis=1) result = (np.equal(indices, y_true) * 1).reshape(-1) correct_num += result.sum() total_num += result.shape[0] if total_num == 0: raise RuntimeError(f'Accuracy can not be calculated, because the number of samples is ' f'{0}. Please check whether your inputs(predicted value, true value) ' f'are empty.') acc = correct_num / total_num return acc
[文档]def precision(preds, labels): r""" Calculates the precision. Precision (also known as positive predictive value) is the actual positive proportion in the predicted positive sample. It can only be used to evaluate the precision score of binary tasks. The function is shown as follows: .. math:: \text{Precision} =\frac{\text{TP}} {\text{TP} + \text{FP}} where `TP` is the number of true posistive cases, `FP` is the number of false posistive cases. Args: preds (Union[Tensor, list, np.ndarray]): Predicted value. `preds` is a list of floating numbers in range :math:`[0, 1]` and the shape of `preds` is :math:`(N, C)` in most cases (not strictly), where :math:`N` is the number of cases and :math:`C` is the number of categories. labels (Union[Tensor, list, np.ndarray]): Ground truth. `labels` must be in one-hot format that shape is :math:`(N, C)`, or can be transformed to one-hot format that shape is :math:`(N,)`. Returns: - **prec** (np.ndarray) - The computed result. Raises: ValueError: If `preds` doesn't have the same classes number as `labels`. Example: >>> import numpy as np >>> import mindspore >>> from mindspore import Tensor >>> from mindnlp.common.metrics import precision >>> preds = Tensor(np.array([[0.2, 0.5], [0.3, 0.1], [0.9, 0.6]]), mindspore.float32) >>> labels = Tensor(np.array([1, 0, 1]), mindspore.int32) >>> prec = precision(preds, labels) >>> print(prec) [0.5 1. ] """ y_pred = _convert_data_type(preds) y_true = _convert_data_type(labels) if y_pred.ndim == y_true.ndim and _check_onehot_data(y_true): y_true = y_true.argmax(axis=1) _check_shape(y_pred, y_true) class_num = y_pred.shape[1] if y_true.max() + 1 > class_num: raise ValueError(f'`preds` should have the same classes number as `labels`, but got `preds`' f'classes {class_num}, true value classes {y_true.max() + 1}') y_true = np.eye(class_num)[y_true.reshape(-1)] indices = y_pred.argmax(axis=1).reshape(-1) y_pred = np.eye(class_num)[indices] positives = y_pred.sum(axis=0) true_positives = (y_true * y_pred).sum(axis=0) epsilon = sys.float_info.min prec = (true_positives / (positives + epsilon)) return prec
[文档]def recall(preds, labels): r""" Calculates the recall. Recall is also referred to as the true positive rate or sensitivity. The function is shown as follows: .. math:: \text{Recall} =\frac{\text{TP}} {\text{TP} + \text{FN}} where `TP` is the number of true posistive cases, `FN` is the number of false negative cases. Args: preds (Union[Tensor, list, np.ndarray]): Predicted value. `preds` is a list of floating numbers in range :math:`[0, 1]` and the shape of `preds` is :math:`(N, C)` in most cases (not strictly), where :math:`N` is the number of cases and :math:`C` is the number of categories. labels (Union[Tensor, list, np.ndarray]): Ground truth. `labels` must be in one-hot format that shape is :math:`(N, C)`, or can be transformed to one-hot format that shape is :math:`(N,)`. Returns: - **rec** (np.ndarray) - The computed result. Raises: ValueError: If `preds` doesn't have the same classes number as `labels`. Example: >>> import numpy as np >>> import mindspore >>> from mindspore import Tensor >>> from mindnlp.common.metrics import recall >>> preds = Tensor(np.array([[0.2, 0.5], [0.3, 0.1], [0.9, 0.6]]), mindspore.float32) >>> labels = Tensor(np.array([1, 0, 1]), mindspore.int32) >>> rec = recall(preds, labels) >>> print(rec) [1. 0.5] """ y_pred = _convert_data_type(preds) y_true = _convert_data_type(labels) if y_pred.ndim == y_true.ndim and _check_onehot_data(y_true): y_true = y_true.argmax(axis=1) _check_shape(y_pred, y_true) class_num = y_pred.shape[1] if y_true.max() + 1 > class_num: raise ValueError(f'`preds` should have the same classes number as `labels`, but got `preds`' f' classes {class_num}, true value classes {y_true.max() + 1}.') y_true = np.eye(class_num)[y_true.reshape(-1)] indices = y_pred.argmax(axis=1).reshape(-1) y_pred = np.eye(class_num)[indices] actual_positives = y_true.sum(axis=0) true_positives = (y_true * y_pred).sum(axis=0) epsilon = sys.float_info.min rec = (true_positives / (actual_positives + epsilon)) return rec
[文档]def f1_score(preds, labels): r""" Calculates the F1 score. Fbeta score is a weighted mean of precision and recall, and F1 score is a special case of Fbeta when beta is 1. The function is shown as follows: .. math:: F_1=\frac{2\cdot TP}{2\cdot TP + FN + FP} where `TP` is the number of true posistive cases, `FN` is the number of false negative cases, `FP` is the number of false positive cases. Args: preds (Union[Tensor, list, np.ndarray]): Predicted value. `preds` is a list of floating numbers in range :math:`[0, 1]` and the shape of `preds` is :math:`(N, C)` in most cases (not strictly), where :math:`N` is the number of cases and :math:`C` is the number of categories. labels (Union[Tensor, list, np.ndarray]): Ground truth. `labels` must be in one-hot format that shape is :math:`(N, C)`, or can be transformed to one-hot format that shape is :math:`(N,)`. Returns: - **f1_s** (np.ndarray) - The computed result. Raises: ValueError: If `preds` doesn't have the same classes number as `labels`. Example: >>> import numpy as np >>> import mindspore >>> from mindspore import Tensor >>> from mindnlp.common.metrics import f1_score >>> preds = Tensor(np.array([[0.2, 0.5], [0.3, 0.1], [0.9, 0.6]])) >>> labels = Tensor(np.array([1, 0, 1])) >>> f1_s = f1_score(preds, labels) >>> print(f1_s) [0.6666666666666666 0.6666666666666666] """ y_pred = _convert_data_type(preds) y_true = _convert_data_type(labels) if y_pred.ndim == y_true.ndim and _check_onehot_data(y_true): y_true = y_true.argmax(axis=1) _check_shape(y_pred, y_true) class_num = y_pred.shape[1] if y_true.max() + 1 > class_num: raise ValueError(f'`preds` and `labels` should contain same classes, but got `preds` ' f'contains {class_num} classes and true value contains ' f'{y_true.max() + 1}') y_true = np.eye(class_num)[y_true.reshape(-1)] indices = y_pred.argmax(axis=1).reshape(-1) y_pred = np.eye(class_num)[indices] positives = y_pred.sum(axis=0) actual_positives = y_true.sum(axis=0) true_positives = (y_true * y_pred).sum(axis=0) epsilon = sys.float_info.min f1_s = (2 * true_positives / (actual_positives + positives + epsilon)) return f1_s
[文档]def matthews_correlation(preds, labels): r""" Calculates the Matthews correlation coefficient (MCC). MCC is in essence a correlation coefficient between the observed and predicted binary classifications; it returns a value between −1 and +1. A coefficient of +1 represents a perfect prediction, 0 no better than random prediction and −1 indicates total disagreement between prediction and observation. The function is shown as follows: .. math:: MCC=\frac{TP \times TN-FP \times FN}{\sqrt{(TP+FP)(TP+FN)(TN+FP)(TN+FN)}} where `TP` is the number of true posistive cases, `TN` is the number of true negative cases, `FN` is the number of false negative cases, `FP` is the number of false positive cases. Args: preds (Union[Tensor, list, np.ndarray]): Predicted value. `preds` is a list of floating numbers and the shape of `preds` is :math:`(N, C)` in most cases (not strictly), where :math:`N` is the number of cases and :math:`C` is the number of categories. labels (Union[Tensor, list, np.ndarray]): Ground truth. `labels` must be in one-hot format that shape is :math:`(N, C)`, or can be transformed to one-hot format that shape is :math:`(N,)`. Returns: - **m_c_c** (float) - The computed result. Example: >>> import numpy as np >>> import mindspore >>> from mindspore import Tensor >>> from mindnlp.common.metrics import matthews_correlation >>> preds = [[0.8, 0.2], [-0.5, 0.5], [0.1, 0.4], [0.6, 0.3], [0.6, 0.3]] >>> labels = [0, 1, 0, 1, 0] >>> m_c_c = matthews_correlation(preds, labels) >>> print(m_c_c) 0.16666666666666666 """ preds = _convert_data_type(preds) labels = _convert_data_type(labels) if preds.ndim == labels.ndim and _check_onehot_data(labels): labels = labels.argmax(axis=1) _check_shape(preds, labels) preds = np.argmax(preds, axis=1) labels = labels.reshape(-1, 1) t_p = 0 f_p = 0 t_n = 0 f_n = 0 sample_num = labels.shape[0] for i in range(sample_num): pred = preds[i] label = labels[i] if pred == 1: if pred == label: t_p += 1 else: f_p += 1 else: if pred == label: t_n += 1 else: f_n += 1 if t_p == 0 or f_p == 0 or t_n == 0 or f_n == 0: m_c_c = 0.0 else: m_c_c = (t_p * t_n - f_p * f_n) / math.sqrt( (t_p + f_p) * (t_p + f_n) * (t_n + f_p) * (t_n + f_n)) return m_c_c
[文档]def pearson_correlation(preds, labels): r""" Calculates the Pearson correlation coefficient (PCC). PCC is a measure of linear correlation between two sets of data. It is the ratio between the covariance of two variables and the product of their standard deviations; thus, it is essentially a normalized measurement of the covariance, such that the result always has a value between −1 and 1. Args: preds (Union[Tensor, list, np.ndarray]): Predicted value. `preds` is a list of floating numbers and the shape of `preds` is :math:`(N, 1)`. labels (Union[Tensor, list, np.ndarray]): Ground truth. `labels` is a list of floating numbers and the shape of `preds` is :math:`(N, 1)`. Returns: - **p_c_c** (float) - The computed result. Raises: RuntimeError: If `preds` and `labels` have different lengths. Example: >>> import numpy as np >>> import mindspore >>> from mindspore import Tensor >>> from mindnlp.common.metrics import pearson_correlation >>> preds = Tensor(np.array([[0.1], [1.0], [2.4], [0.9]]), mindspore.float32) >>> labels = Tensor(np.array([[0.0], [1.0], [2.9], [1.0]]), mindspore.float32) >>> p_c_c = pearson_correlation(preds, labels) >>> print(p_c_c) 0.9985229081857804 """ def _pearson_correlation(y_pred, y_true): n_pred = len(y_pred) # simple sums sum1 = sum(float(y_pred[i]) for i in range(n_pred)) sum2 = sum(float(y_true[i]) for i in range(n_pred)) # sum up the squares sum1_pow = sum(pow(v, 2.0) for v in y_pred) sum2_pow = sum(pow(v, 2.0) for v in y_true) # sum up the products p_sum = sum(y_pred[i] * y_true[i] for i in range(n_pred)) numerator = p_sum - (sum1 * sum2 / n_pred) denominator = math.sqrt( (sum1_pow - pow(sum1, 2) / n_pred) * (sum2_pow - pow(sum2, 2) / n_pred)) if denominator == 0: return 0.0 return numerator / denominator preds = _convert_data_type(preds) labels = _convert_data_type(labels) preds = np.squeeze(preds.reshape(-1, 1)).tolist() labels = np.squeeze(labels.reshape(-1, 1)).tolist() if len(preds) != len(labels): raise RuntimeError(f'`preds` and `labels` should have the same length, but got `preds` ' f'length {len(preds)}, `labels` length {len(labels)})') p_c_c = _pearson_correlation(preds, labels) return p_c_c
[文档]def spearman_correlation(preds, labels): r""" Calculates the Spearman's rank correlation coefficient (SRCC). It is a nonparametric measure of rank correlation (statistical dependence between the rankings of two variables). It assesses how well the relationship between two variables can be described using a monotonic function. If there are no repeated data values, a perfect Spearman correlation of +1 or −1 occurs when each of the variables is a perfect monotone function of the other. Args: preds (Union[Tensor, list, np.ndarray]): Predicted value. `preds` is a list of floating numbers and the shape of `preds` is :math:`(N, 1)`. labels (Union[Tensor, list, np.ndarray]): Ground truth. `labels` is a list of floating numbers and the shape of `preds` is :math:`(N, 1)`. Returns: - **s_r_c_c** (float) - The computed result. Raises: RuntimeError: If `preds` and `labels` have different lengths. Example: >>> import numpy as np >>> import mindspore >>> from mindspore import Tensor >>> from mindnlp.common.metrics import spearman_correlation >>> preds = Tensor(np.array([[0.1], [1.0], [2.4], [0.9]]), mindspore.float32) >>> labels = Tensor(np.array([[0.0], [1.0], [2.9], [1.0]]), mindspore.float32) >>> s_r_c_c = spearman_correlation(preds, labels) >>> print(s_r_c_c) 1.0 """ def _spearman(y_pred, y_true): preds_rank = _get_rank(y_pred) labels_rank = _get_rank(y_true) total = 0 n_pred = len(y_pred) for i in range(n_pred): total += pow((preds_rank[i] - labels_rank[i]), 2) res = 1 - float(6 * total) / (n_pred * (pow(n_pred, 2) - 1)) return res preds = _convert_data_type(preds) labels = _convert_data_type(labels) preds = np.squeeze(preds.reshape(-1, 1)).tolist() labels = np.squeeze(labels.reshape(-1, 1)).tolist() if len(preds) != len(labels): raise RuntimeError(f'`preds` and `labels` should have the same length, but got `preds` ' f'length {len(preds)}, `labels` length {len(labels)})') s_r_c_c = _spearman(preds, labels) return s_r_c_c
[文档]def em_score(preds, examples): r""" Calculates the exact match (EM) score. This metric measures the percentage of predictions that match any one of the ground truth exactly. Args: preds (Union[str, list]): Predicted value. examples (list): Ground truth. Returns: - **exact_match** (float) - The computed result. Raises: RuntimeError: If `preds` and `examples` have different lengths. Example: >>> import numpy as np >>> import mindspore >>> from mindspore import Tensor >>> from mindnlp.common.metrics import em_score >>> preds = "this is the best span" >>> examples = ["this is a good span", "something irrelevant"] >>> exact_match = em_score(preds, examples) >>> print(exact_match) 0.0 """ _check_value_type("preds", preds, [str, list]) _check_value_type("examples", examples, [list]) if not isinstance(preds, list): preds = [preds] examples = [examples] if len(preds) != len(examples): raise RuntimeError(f'`preds` and `examples` should have the same length, but got `examples`' f' length {len(preds)}, `labels` length {len(examples)})') count = len(preds) exact_match = 0 for pred, example in zip(preds, examples): exact_match += _metric_max_over_ground_truths( _compute_exact, pred, example ) total_em = int(exact_match) exact_match = total_em / count if count > 0 else 0 return exact_match
[文档]def confusion_matrix(preds, labels, class_num=2): r""" Calculates the confusion matrix. Confusion matrix is commonly used to evaluate the performance of classification models, including binary classification and multiple classification. Args: preds (Union[Tensor, list, np.ndarray]): Predicted value. `preds` is a list of floating numbers and the shape of `preds` is :math:`(N, C)` or :math:`(N,)`. labels (Union[Tensor, list, np.ndarray]): Ground truth. The shape of `labels` is :math:`(N,)`. class_num (int): Number of classes in the dataset. Default: 2. Returns: - **conf_mat** (np.ndarray) - The computed result. Raises: ValueError: If `preds` and `labels` do not have valid dimensions. Example: >>> import numpy as np >>> import mindspore >>> from mindspore import Tensor >>> from mindnlp.common.metrics import confusion_matrix >>> preds = Tensor(np.array([1, 0, 1, 0])) >>> labels = Tensor(np.array([1, 0, 0, 1])) >>> conf_mat = confusion_matrix(preds, labels) >>> print(conf_mat) [[1. 1.] [1. 1.]] """ class_num = _check_value_type("class_num", class_num, [int]) preds = _convert_data_type(preds) labels = _convert_data_type(labels) if preds.ndim not in (labels.ndim, labels.ndim + 1): raise ValueError(f'`preds` and `labels` should have the same dimensions, or the dimension ' f'of `preds` equals the dimension of true value add 1, but got `preds` ' f'ndim: {preds.ndim}, `labels` ndim: {labels.ndim}.') if preds.ndim == labels.ndim + 1: preds = np.argmax(preds, axis=1) trans = (labels.reshape(-1) * class_num + preds.reshape(-1)).astype(int) bincount = np.bincount(trans, minlength=class_num ** 2) conf_mat = bincount.reshape(class_num, class_num) conf_mat = conf_mat.astype(float) return conf_mat
# Common functions. def _check_value_type(arg_name, arg_value, valid_types): """ Checks whether the data type of argument is valid Args: arg_name (str): Name of the argument validated. arg_value (Object): Value of the argument validated. valid_types (list): Valid data types of the argument. Returns: - **arg_value** (Object) - Value of the argument validated. Raises: TypeError: If the data type of the argument is not valid. """ valid_types = valid_types if isinstance(valid_types, Iterable) else (valid_types,) num_types = len(valid_types) if isinstance(arg_value, bool) and bool not in tuple(valid_types): raise TypeError(f'Type of `{arg_name}` should be {"one of " if num_types > 1 else ""}' f' `{valid_types if num_types > 1 else str(valid_types[0])}`, ' f'but got `{arg_value}` with type `{type(arg_value).__name__}`.') if not isinstance(arg_value, tuple(valid_types)): raise TypeError(f'Type of `{arg_name}` should be {"one of " if num_types > 1 else ""}' f'`{valid_types if num_types > 1 else str(valid_types[0])}`, ' f'but got `{arg_value}` with type `{type(arg_value).__name__}`.') return arg_value def _check_onehot_data(data): """ Checks whether input data is one-hot encoding. Args: data (np.array): Input data. Returns: - **ans** (bool) - Rreturn true, if input data is one-hot encoding. """ ans = False data = _convert_data_type(data) if np.equal(data ** 2, data).all(): shp = (data.shape[0],) + data.shape[2:] if np.equal(np.ones(shp), data.sum(axis=1)).all(): ans = True return ans return ans def _convert_data_type(data): """ Converts data type to numpy array. Args: data (Union[Tensor, list, np.ndarray]): Input data. Returns: - **data** (np.ndarray) - Data with `np.ndarray` type. Raises: TypeError: If the data type is not tensor, list or np.ndarray. """ if isinstance(data, Tensor): data = data.asnumpy() elif isinstance(data, list): data = np.array(data) elif isinstance(data, np.ndarray): pass else: raise TypeError(f'Input data type must be tensor, list or ' f'np.ndarray, but got {type(data)}.') return data def _count_ngram(input_list, n_gram): ngram_counter = Counter() for i in range(1, n_gram + 1): for j in range(len(input_list) - i + 1): ngram_key = tuple(input_list[j:(i + j)]) ngram_counter[ngram_key] += 1 return ngram_counter def _check_shape(y_pred, y_true, n_class=None): """ Checks the shapes of y_pred and y_true. Args: y_pred (Tensor): Predict tensor. y_true (Tensor): Target tensor. """ if y_pred.ndim != y_true.ndim + 1 and n_class != 1: raise ValueError(f'The dimension of `y_pred` should be equal to the dimension of `y_true` ' f'add 1, but got `y_pred` dimension: {y_pred.ndim} and `y_true` dimension:' f' {y_true.ndim}.') if y_true.shape != (y_pred.shape[0],) + y_pred.shape[2:] and n_class != 1: raise ValueError(f'`y_pred` shape and `y_true` shape can not match, `y_true` shape should ' f'be equal to `y_pred` shape that the value at index 1 is deleted. Such as' f' `y_pred` shape (1, 2, 3), then `y_true` shape should be (1, 3). But got' f' `y_pred` shape {y_pred.shape} and `y_true` shape {y_true.shape}.') def _get_ngrams(words, n_size=1): """ Calculates n-gram for multiple sentences. """ ngram_set = set() max_start = len(words) - n_size for i in range(max_start + 1): print(tuple(words[i:i + n_size])) ngram_set.add(tuple(words[i:i + n_size])) return ngram_set def _lcs(strg, sub): """ Calculates the length of longest common subsequence of strg and sub. Args: strg (list): The string to be calculated, usually longer the sub string. sub (list): The sub string to be calculated. Returns: - **length** (float) - The length of the longest common subsequence of string and sub. """ if len(strg) < len(sub): sub, strg = strg, sub lengths = np.zeros((len(strg) + 1, len(sub) + 1)) for j in range(1, len(sub) + 1): for i in range(1, len(strg) + 1): if strg[i - 1] == sub[j - 1]: lengths[i][j] = lengths[i - 1][j - 1] + 1 else: lengths[i][j] = max(lengths[i - 1][j], lengths[i][j - 1]) length = lengths[len(strg)][len(sub)] return length def _get_rank(raw_list): raw_x = np.array(raw_list) rank_x = np.empty(raw_x.shape, dtype=int) sort_x = np.argsort(-raw_x) for i, k in enumerate(sort_x): rank_x[k] = i + 1 return rank_x def _compute_exact(y_pred, y_true): def _normalize_answer(txt): """Lowers text and removes punctuation, articles and extra whitespace.""" def remove_articles(text): regex = re.compile(r"\b(a|an|the)\b", re.UNICODE) return re.sub(regex, " ", text) def white_space_fix(text): return " ".join(text.split()) def remove_punc(text): exclude = set(string.punctuation) return "".join(ch for ch in text if ch not in exclude) def lower(text): return text.lower() return white_space_fix(remove_articles(remove_punc(lower(txt)))) return int(_normalize_answer(y_pred) == _normalize_answer(y_true)) def _metric_max_over_ground_truths(metric_fn, pred, example): scores_for_ground_truths = [] for y_eg in example: score = metric_fn(pred, y_eg) scores_for_ground_truths.append(score) return round(max(scores_for_ground_truths), 2)