# -*- coding: utf-8 -*-
"""
biosppy.stats
-------------
This module provides statistical functions and related tools.
:copyright: (c) 2015-2023 by Instituto de Telecomunicacoes
:license: BSD 3-clause, see LICENSE for more details.
"""
# Imports
# compat
from __future__ import absolute_import, division, print_function
import six
# local
from . import utils
from .signals import tools
# 3rd party
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr, ttest_rel, ttest_ind
[docs]def pearson_correlation(x=None, y=None):
"""Compute the Pearson Correlation Coefficient between two signals.
The coefficient is given by:
.. math::
r_{xy} = \\frac{E[(X - \\mu_X) (Y - \\mu_Y)]}{\\sigma_X \\sigma_Y}
Parameters
----------
x : array
First input signal.
y : array
Second input signal.
Returns
-------
r : float
Pearson correlation coefficient, ranging between -1 and +1.
pvalue : float
Two-tailed p-value. The p-value roughly indicates the probability of
an uncorrelated system producing datasets that have a Pearson correlation
at least as extreme as the one computed from these datasets.
Raises
------
ValueError
If the input signals do not have the same length.
"""
# check inputs
if x is None:
raise TypeError("Please specify the first input signal.")
if y is None:
raise TypeError("Please specify the second input signal.")
# ensure numpy
x = np.array(x)
y = np.array(y)
n = len(x)
if n != len(y):
raise ValueError("Input signals must have the same length.")
r, pvalue = pearsonr(x, y)
args = (r, pvalue)
names = ('r', 'pvalue')
return utils.ReturnTuple(args, names)
[docs]def linear_regression(x=None, y=None, show=True):
"""Plot the linear regression between two signals and get the equation coefficients.
The linear regression uses the least squares method.
Parameters
----------
x : array
First input signal.
y : array
Second input signal.
show : bool
If True, show the plot.
Returns
-------
coeffs : array
Linear regression coefficients: [m, b].
Raises
------
ValueError
If the input signals do not have the same length.
"""
# check inputs
if x is None:
raise TypeError("Please specify the first input signal.")
if y is None:
raise TypeError("Please specify the second input signal.")
# ensure numpy
x = np.array(x)
y = np.array(y)
n = len(x)
if n != len(y):
raise ValueError("Input signals must have the same length.")
coeffs = np.polyfit(x, y, 1)
f = np.poly1d(coeffs)
x_min = x.min()
x_max = x.max()
y_min = f(x_min)
y_max = f(x_max)
if show:
plt.scatter(x, y)
plt.plot(
[x_min, x_max],
[y_min, y_max],
c="orange",
label="y={:.3f}x+{:.3f}".format(coeffs[0], coeffs[1]),
)
plt.title("Linear Regression")
plt.xlabel("x")
plt.ylabel("y")
plt.legend()
args = (coeffs[0], coeffs[1])
labels = ["m", "b"]
return utils.ReturnTuple(args, labels)
[docs]def paired_test(x=None, y=None):
"""
Perform the Student's paired t-test on the arrays x and y.
This is a two-sided test for the null hypothesis that 2 related
or repeated samples have identical average (expected) values.
Parameters
----------
x : array
First input signal.
y : array
Second input signal.
Returns
-------
statistic : float
t-statistic. The t-statistic is used in a t-test to determine
if you should support or reject the null hypothesis.
pvalue : float
Two-sided p-value.
Raises
------
ValueError
If the input signals do not have the same length.
"""
# check inputs
if x is None:
raise TypeError("Please specify the first input signal.")
if y is None:
raise TypeError("Please specify the second input signal.")
# ensure numpy
x = np.array(x)
y = np.array(y)
n = len(x)
if n != len(y):
raise ValueError("Input signals must have the same length.")
statistic, pvalue = ttest_rel(x, y)
return statistic, pvalue
[docs]def unpaired_test(x=None, y=None):
"""
Perform the Student's unpaired t-test on the arrays x and y.
This is a two-sided test for the null hypothesis that 2 independent
samples have identical average (expected) values. This test assumes
that the populations have identical variances by default.
Parameters
----------
x : array
First input signal.
y : array
Second input signal.
Returns
-------
statistic : float
t-statistic. The t-statistic is used in a t-test to determine
if you should support or reject the null hypothesis.
pvalue : float
Two-sided p-value.
Raises
------
ValueError
If the input signals do not have the same length.
"""
# check inputs
if x is None:
raise TypeError("Please specify the first input signal.")
if y is None:
raise TypeError("Please specify the second input signal.")
# ensure numpy
x = np.array(x)
y = np.array(y)
n = len(x)
if n != len(y):
raise ValueError("Input signals must have the same length.")
statistic, pvalue = ttest_ind(x, y)
return statistic, pvalue
[docs]def histogram(signal=None, bins=5, normalize=True):
"""Compute histogram of the input signal.
Parameters
----------
signal : array
Input signal.
bins : int, optional
Number of histogram bins. Default is 5.
normalize : bool, optional
Whether to normalize the histogram counts. Default is True.
Returns
-------
hist{bin}_bins : float
Number of counts of the bin. If `normalize` is True, the counts are normalized.
"""
# check inputs
if signal is None:
raise TypeError("Please specify an input signal.")
# ensure input formats
signal = np.array(signal)
bins = int(bins)
# initialize output
out = utils.ReturnTuple((), ())
# compute histogram
hist = np.histogram(signal, bins=bins)[0]
if normalize:
hist = hist / np.sum(hist) # normalization
# add counts
for index, count in enumerate(hist):
out = out.append(count, 'hist_' + str(index+1) + '_' + str(bins))
return out
[docs]def quartiles(signal=None):
"""Compute quartile features of the signal.
Parameters
----------
signal : array
Input signal.
Returns
-------
q1 : float
First quartile.
q2 : float
Second quartile, also known as median.
q3 : float
Third quartile.
iqr : float
Interquartile range.
midhinge : float
Midhinge.
trimean : float
Trimean.
"""
# check inputs
if signal is None:
raise TypeError("Please specify an input signal.")
# ensure input formats
signal = np.array(signal)
# initialize output
out = utils.ReturnTuple((), ())
# compute quartiles
q1, q2, q3 = np.quantile(signal, [0.25, 0.5, 0.75])
out = out.append([q1, q2, q3], ['q1', 'q2', 'q3'])
# iqr
iqr = q3 - q1
out = out.append(iqr, 'iqr')
# midhinge
midhinge = (q3 + q1) / 2
out = out.append(midhinge, 'midhinge')
# trimean
trimean = (q2 + midhinge) / 2
out = out.append(trimean, 'trimean')
return out
[docs]def diff_stats(signal=None, stats_only=True):
"""Compute statistical features from the first signal differences, second
signal differences and absolute signal differences.
Parameters
----------
signal : array
Input signal.
stats_only : bool, optional
Whether to output only statistical features. Default is True.
Returns
-------
{diff} : array
Difference signal. {diff} can be 'diff', 'diff2' or 'abs_diff'.
{diff}_mean : float
Mean of the difference signal.
{diff}_median : float
Median of the difference signal.
{diff}_min : float
Minimum of the difference signal.
{diff}_max : float
Maximum of the difference signal.
{diff}_max_amp : float
Maximum amplitude of the difference signal.
{diff}_range : float
Range of the difference signal.
{diff}_var : float
Variance of the difference signal.
{diff}_std : float
Standard deviation of the difference signal.
{diff}_sum : float
Sum of the difference signal.
"""
# check inputs
if signal is None:
raise TypeError("Please specify an input signal.")
# ensure numpy
signal = np.array(signal)
# initialize output
out = utils.ReturnTuple((), ())
# compute differences
sig_diff = np.diff(signal)
sig_diff_2 = np.diff(sig_diff)
sig_diff_abs = np.abs(sig_diff)
diffs = [sig_diff, sig_diff_2, sig_diff_abs]
labels = ['firstdiff', 'seconddiff', 'absdiff']
# extract features
for diff, label in zip(diffs, labels):
# add to output
if not stats_only:
out = out.append(diff, label)
# compute stats
diff_stat = tools.signal_stats(diff)
# add to output
for arg, name in zip(diff_stat, diff_stat.keys()):
out = out.append(arg, label + '_' + name)
# sum
sum_ = np.sum(diff)
out = out.append(sum_, label + '_' + 'sum')
return out