Source code for biosppy.stats

# -*- coding: utf-8 -*-
"""
biosppy.stats
-------------

This module provides statistical functions and related tools.

:copyright: (c) 2015-2023 by Instituto de Telecomunicacoes
:license: BSD 3-clause, see LICENSE for more details.
"""

# Imports
# compat
from __future__ import absolute_import, division, print_function
import six

# local
from . import utils
from .signals import tools

# 3rd party
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr, ttest_rel, ttest_ind


def pearson_correlation(x=None, y=None):
    """Compute the Pearson Correlation Coefficient between two signals.

    The coefficient is given by:

    .. math::

        r_{xy} = \\frac{E[(X - \\mu_X) (Y - \\mu_Y)]}{\\sigma_X \\sigma_Y}

    Parameters
    ----------
    x : array
        First input signal.
    y : array
        Second input signal.

    Returns
    -------
    r : float
        Pearson correlation coefficient, ranging between -1 and +1.
    pvalue : float
        Two-tailed p-value. The p-value roughly indicates the probability of
        an uncorrelated system producing datasets that have a Pearson
        correlation at least as extreme as the one computed from these
        datasets.

    Raises
    ------
    TypeError
        If either input signal is not specified.
    ValueError
        If the input signals do not have the same length.

    """

    # both signals are mandatory
    if x is None:
        raise TypeError("Please specify the first input signal.")
    if y is None:
        raise TypeError("Please specify the second input signal.")

    # work on numpy arrays
    first = np.array(x)
    second = np.array(y)

    # the correlation is only defined sample-by-sample
    if len(first) != len(second):
        raise ValueError("Input signals must have the same length.")

    coefficient, p_value = pearsonr(first, second)

    return utils.ReturnTuple((coefficient, p_value), ('r', 'pvalue'))
def linear_regression(x=None, y=None, show=True):
    """Fit a least-squares line to two signals and optionally plot it.

    The line ``y = m*x + b`` is fitted with a degree-1 ``numpy.polyfit``.

    Parameters
    ----------
    x : array
        First input signal.
    y : array
        Second input signal.
    show : bool, optional
        If True, draw a scatter plot of the data together with the fitted
        line using the matplotlib ``pyplot`` interface. This function does
        not call ``plt.show()`` itself. Default is True.

    Returns
    -------
    m : float
        Slope of the fitted line.
    b : float
        Intercept of the fitted line.

    Raises
    ------
    TypeError
        If either input signal is not specified.
    ValueError
        If the input signals do not have the same length.

    """

    # check inputs
    if x is None:
        raise TypeError("Please specify the first input signal.")
    if y is None:
        raise TypeError("Please specify the second input signal.")

    # ensure numpy
    x = np.array(x)
    y = np.array(y)

    if len(x) != len(y):
        raise ValueError("Input signals must have the same length.")

    # degree-1 least-squares fit; coefficients are ordered [slope, intercept]
    coeffs = np.polyfit(x, y, 1)

    if show:
        # the fitted line is fully described by its two end points, so only
        # evaluate the polynomial at the extremes of x
        line = np.poly1d(coeffs)
        x_min = x.min()
        x_max = x.max()

        plt.scatter(x, y)
        plt.plot(
            [x_min, x_max],
            [line(x_min), line(x_max)],
            c="orange",
            label="y={:.3f}x+{:.3f}".format(coeffs[0], coeffs[1]),
        )
        plt.title("Linear Regression")
        plt.xlabel("x")
        plt.ylabel("y")
        plt.legend()

    args = (coeffs[0], coeffs[1])
    labels = ["m", "b"]

    return utils.ReturnTuple(args, labels)
def paired_test(x=None, y=None):
    """Perform the Student's paired t-test on the arrays x and y.

    This is a two-sided test for the null hypothesis that 2 related or
    repeated samples have identical average (expected) values.

    Parameters
    ----------
    x : array
        First input signal.
    y : array
        Second input signal.

    Returns
    -------
    statistic : float
        t-statistic. The t-statistic is used in a t-test to determine if you
        should support or reject the null hypothesis.
    pvalue : float
        Two-sided p-value.

    Raises
    ------
    TypeError
        If either input signal is not specified.
    ValueError
        If the input signals do not have the same length.

    """

    # both samples are mandatory
    if x is None:
        raise TypeError("Please specify the first input signal.")
    if y is None:
        raise TypeError("Please specify the second input signal.")

    # work on numpy arrays
    first = np.array(x)
    second = np.array(y)

    # paired observations must match one-to-one
    if len(first) != len(second):
        raise ValueError("Input signals must have the same length.")

    t_stat, p_val = ttest_rel(first, second)

    return t_stat, p_val
def unpaired_test(x=None, y=None):
    """Perform the Student's unpaired t-test on the arrays x and y.

    This is a two-sided test for the null hypothesis that 2 independent
    samples have identical average (expected) values. This test assumes that
    the populations have identical variances by default.

    Because the samples are independent, they are not required to have the
    same number of observations.

    Parameters
    ----------
    x : array
        First input signal.
    y : array
        Second input signal.

    Returns
    -------
    statistic : float
        t-statistic. The t-statistic is used in a t-test to determine if you
        should support or reject the null hypothesis.
    pvalue : float
        Two-sided p-value.

    Raises
    ------
    TypeError
        If either input signal is not specified.

    """

    # check inputs
    if x is None:
        raise TypeError("Please specify the first input signal.")
    if y is None:
        raise TypeError("Please specify the second input signal.")

    # ensure numpy
    x = np.array(x)
    y = np.array(y)

    # no length check here: unlike the paired test, observations of
    # independent samples are not matched one-to-one, and ttest_ind
    # supports groups of different sizes
    statistic, pvalue = ttest_ind(x, y)

    return statistic, pvalue
def histogram(signal=None, bins=5, normalize=True):
    """Compute histogram of the input signal.

    Parameters
    ----------
    signal : array
        Input signal.
    bins : int, optional
        Number of histogram bins. Default is 5.
    normalize : bool, optional
        Whether to normalize the histogram counts so that they sum to one.
        Default is True.

    Returns
    -------
    hist_{index}_{bins} : float
        Number of counts of the bin, where {index} is the 1-based bin index
        and {bins} is the total number of bins (e.g. 'hist_1_5'). If
        `normalize` is True, the counts are divided by the total number of
        counts; when there are no counts at all (empty signal) the bins are
        reported as 0.0 instead of NaN.

    Raises
    ------
    TypeError
        If the input signal is not specified.

    """

    # check inputs
    if signal is None:
        raise TypeError("Please specify an input signal.")

    # ensure input formats
    signal = np.array(signal)
    bins = int(bins)

    # compute histogram (counts only, bin edges are discarded)
    hist = np.histogram(signal, bins=bins)[0]

    if normalize:
        total = np.sum(hist)
        # an empty signal gives all-zero counts; dividing by the zero total
        # would produce NaNs, so keep the zeros (as floats) instead
        hist = hist / total if total > 0 else hist.astype(float)

    # add counts, one named entry per bin
    out = utils.ReturnTuple((), ())
    for index, count in enumerate(hist, start=1):
        out = out.append(count, 'hist_' + str(index) + '_' + str(bins))

    return out
def quartiles(signal=None):
    """Compute quartile features of the signal.

    Parameters
    ----------
    signal : array
        Input signal.

    Returns
    -------
    q1 : float
        First quartile.
    q2 : float
        Second quartile, also known as median.
    q3 : float
        Third quartile.
    iqr : float
        Interquartile range, ``q3 - q1``.
    midhinge : float
        Midhinge, the average of the first and third quartiles.
    trimean : float
        Trimean, the average of the median and the midhinge.

    Raises
    ------
    TypeError
        If the input signal is not specified.

    """

    # the signal is mandatory
    if signal is None:
        raise TypeError("Please specify an input signal.")

    values = np.array(signal)

    # the three quartiles in a single call
    lower, median, upper = np.quantile(values, [0.25, 0.5, 0.75])

    # derived features
    spread = upper - lower
    hinge = (upper + lower) / 2
    tri = (median + hinge) / 2

    # assemble the named output
    out = utils.ReturnTuple((), ())
    out = out.append([lower, median, upper], ['q1', 'q2', 'q3'])
    out = out.append(spread, 'iqr')
    out = out.append(hinge, 'midhinge')
    out = out.append(tri, 'trimean')

    return out
def diff_stats(signal=None, stats_only=True):
    """Compute statistical features from the first signal differences,
    second signal differences and absolute signal differences.

    Parameters
    ----------
    signal : array
        Input signal.
    stats_only : bool, optional
        Whether to output only statistical features. Default is True.

    Returns
    -------
    {diff} : array
        Difference signal, only included if `stats_only` is False. {diff}
        is 'firstdiff' (first differences), 'seconddiff' (differences of
        the first differences) or 'absdiff' (absolute first differences).
    {diff}_{stat} : float
        One entry for each statistic returned by `tools.signal_stats` for
        the difference signal, where {stat} is the name of that statistic.
    {diff}_sum : float
        Sum of the difference signal.

    Raises
    ------
    TypeError
        If the input signal is not specified.

    """

    # the signal is mandatory
    if signal is None:
        raise TypeError("Please specify an input signal.")

    samples = np.array(signal)

    # difference signals, paired with the prefix used for their features
    first = np.diff(samples)
    second = np.diff(first)
    magnitude = np.abs(first)
    labelled = (
        ('firstdiff', first),
        ('seconddiff', second),
        ('absdiff', magnitude),
    )

    out = utils.ReturnTuple((), ())

    for label, series in labelled:
        # optionally expose the raw difference signal
        if not stats_only:
            out = out.append(series, label)

        # generic statistics, prefixed with the difference label
        stats = tools.signal_stats(series)
        for name, value in zip(stats.keys(), stats):
            out = out.append(value, label + '_' + name)

        # sum is not part of the generic statistics, so add it separately
        out = out.append(np.sum(series), label + '_' + 'sum')

    return out