%%javascript
IPython.OutputArea.prototype._should_scroll = function(lines) {
return false;
}
import pandas as pd
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
import random
Reference

Yeh, A. (2000). More accurate tests for the statistical significance of result differences. arXiv preprint cs/0008005.

Null hypothesis: there is no difference in F-score between the two methods.

Algorithm (approximate randomization, following Yeh 2000): for each of n iterations, swap the two systems' predictions for every example with probability 0.5, recompute the per-label F-score difference on the shuffled predictions, and record it. The p-value for a label is the share of iterations whose recorded difference is at least as extreme as the observed difference.
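Restating what simulate_fs below computes: with $\Delta$ the observed F-score difference for a label and $\Delta^{(i)}$ the difference on the $i$-th shuffled copy,

$$p_{\text{two-sided}} = \frac{1}{n}\sum_{i=1}^{n}\mathbf{1}\!\left[\,\lvert\Delta^{(i)}\rvert \ge \lvert\Delta\rvert\,\right], \qquad p_{\text{one-sided}} = \frac{1}{n}\sum_{i=1}^{n}\mathbf{1}\!\left[\,\Delta^{(i)} \ge \Delta\,\right]$$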
# Dummy data: a column of gold labels plus two columns of simulated system predictions
df = pd.DataFrame(np.random.randint(0, 2, size=(1000, 3)), columns=['true', 'pred1', 'pred2'])
df.head()
|   | true | pred1 | pred2 |
|---|------|-------|-------|
| 0 | 0    | 1     | 0     |
| 1 | 1    | 1     | 0     |
| 2 | 0    | 0     | 0     |
| 3 | 1    | 0     | 0     |
| 4 | 1    | 0     | 1     |
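The random 0/1 data above is only a stand-in so the notebook runs end to end. With real systems you would build the same three-column frame from the gold labels and the two prediction lists; the variable names below (y_true, pred_a, pred_b) and their values are hypothetical:

# Hypothetical example: y_true, pred_a and pred_b would come from your own evaluation;
# only the column names 'true', 'pred1' and 'pred2' matter for the test below.
y_true = [1, 0, 1, 1, 0]
pred_a = [1, 0, 0, 1, 0]
pred_b = [1, 1, 0, 1, 1]
df_real = pd.DataFrame({'true': y_true, 'pred1': pred_a, 'pred2': pred_b})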
def simulate_fs(df, reference, col1, col2, n, two_sided=True, labels=(1, 0)):
    if not two_sided:
        print('\nOne-sided permutation test for', len(labels), 'F-scores, n_permutations =', n, '\n')
        print('H0: F-score of', col1, 'is equal to or smaller than F-score of', col2)
        print('H1: F-score of', col1, 'is larger than F-score of', col2, end='\n\np: ')
    else:
        print('\nTwo-sided permutation test for', len(labels), 'F-scores, n_permutations =', n, '\n')
        print('H0: F-score of', col1, 'is equal to F-score of', col2)
        print('H1: F-score of', col1, 'is not equal to F-score of', col2, end='\n\np: ')

    # Observed per-label F-score difference between the two systems
    _, _, fscores1, _ = precision_recall_fscore_support(df[reference], df[col1],
                                                        labels=labels)
    _, _, fscores2, _ = precision_recall_fscore_support(df[reference], df[col2],
                                                        labels=labels)
    diffs = [a - b for a, b in zip(fscores1, fscores2)]
    diffsdict = dict(zip(labels, diffs))

    # Differences obtained when the two systems' predictions are randomly swapped
    simulated = {label: [] for label in labels}
    for _ in range(n):
        shuffle1 = []
        shuffle2 = []
        # For each example, swap the two predictions with probability 0.5
        for a, b in zip(df[col1], df[col2]):
            if random.randint(0, 1) == 1:
                shuffle1.append(a)
                shuffle2.append(b)
            else:
                shuffle1.append(b)
                shuffle2.append(a)
        _, _, fscores1, _ = precision_recall_fscore_support(df[reference], shuffle1,
                                                            labels=labels)
        _, _, fscores2, _ = precision_recall_fscore_support(df[reference], shuffle2,
                                                            labels=labels)
        diffs = [a - b for a, b in zip(fscores1, fscores2)]
        for label, diff in zip(labels, diffs):
            simulated[label].append(diff)

    # p-value per label: share of permutations at least as extreme as the observed difference
    p = {}
    for label in labels:
        if not two_sided:
            overdiff = [d for d in simulated[label] if d >= diffsdict[label]]
        else:
            overdiff = [d for d in simulated[label] if abs(d) >= abs(diffsdict[label])]
        p[label] = len(overdiff) / n
    return p
print(simulate_fs(df, 'true', 'pred1', 'pred2', 10000, two_sided=True))
print(simulate_fs(df, 'true', 'pred1', 'pred2', 10000, two_sided=False))
Two-sided permutation test for 2 F-scores, n_permutations = 10000

H0: F-score of pred1 is equal to F-score of pred2
H1: F-score of pred1 is not equal to F-score of pred2

p: {1: 0.6368, 0: 0.8709}

One-sided permutation test for 2 F-scores, n_permutations = 10000

H0: F-score of pred1 is equal to or smaller than F-score of pred2
H1: F-score of pred1 is larger than F-score of pred2

p: {1: 0.3122, 0: 0.4358}
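For larger datasets or more permutations, the per-row Python loop becomes the bottleneck. The same swap-with-probability-0.5 scheme can be expressed with a NumPy boolean mask; the sketch below is not part of the original code, the function name simulate_fs_vectorized is made up here, and it only covers the two-sided case:

def simulate_fs_vectorized(df, reference, col1, col2, n, labels=(1, 0)):
    # Same permutation scheme as simulate_fs, but the per-example swap is a vectorized mask
    y_true = df[reference].to_numpy()
    p1 = df[col1].to_numpy()
    p2 = df[col2].to_numpy()
    _, _, f1, _ = precision_recall_fscore_support(y_true, p1, labels=labels)
    _, _, f2, _ = precision_recall_fscore_support(y_true, p2, labels=labels)
    observed = f1 - f2                                  # observed per-label F-score differences
    count = np.zeros(len(labels))
    for _ in range(n):
        swap = np.random.rand(len(y_true)) < 0.5        # rows whose predictions get swapped
        s1 = np.where(swap, p2, p1)
        s2 = np.where(swap, p1, p2)
        _, _, g1, _ = precision_recall_fscore_support(y_true, s1, labels=labels)
        _, _, g2, _ = precision_recall_fscore_support(y_true, s2, labels=labels)
        count += (np.abs(g1 - g2) >= np.abs(observed))  # two-sided comparison, per label
    return dict(zip(labels, count / n))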