diff options
Diffstat (limited to 'stats')
| -rw-r--r-- | stats/student_t_distribution.py | 19 | ||||
| -rw-r--r-- | stats/t-test.py | 50 |
2 files changed, 69 insertions, 0 deletions
# ---------------------------------------------------------------------------
# File: stats/student_t_distribution.py
# Plot the Student's t probability density for several degrees of freedom.
# ---------------------------------------------------------------------------
import math

import numpy as np
from scipy.special import gammaln


def t_dist(t, v):
    """Return the Student's t probability density at ``t``.

    The density is

        f(t) = Gamma((v+1)/2) / (sqrt(v*pi) * Gamma(v/2)) * (1 + t**2/v) ** (-(v+1)/2)

    Args:
        t: Point(s) at which to evaluate the density; a scalar or a NumPy
            array (evaluated element-wise).
        v: Degrees of freedom, a positive scalar.  ``math.inf`` is accepted
            and yields the standard normal density, which is the limit of
            the t density as ``v`` grows.

    Returns:
        The density value(s), a NumPy float for scalar ``t`` or an array
        with the shape of ``t``.

    Raises:
        ValueError: If ``v`` is not strictly positive (this includes NaN).
    """
    if not v > 0:
        raise ValueError("degrees of freedom v must be positive")

    t = np.asarray(t, dtype=float)

    if math.isinf(v):
        # Limit v -> inf: the t distribution becomes the standard normal.
        density = np.exp(-0.5 * t ** 2) / math.sqrt(2 * math.pi)
    else:
        # Work in log space: gamma() itself overflows to inf for large
        # arguments, which would turn the ratio of the two gammas into nan.
        log_norm = (
            gammaln((v + 1) / 2)
            - gammaln(v / 2)
            - 0.5 * math.log(v * math.pi)
        )
        density = np.exp(log_norm - (v + 1) / 2 * np.log1p(t ** 2 / v))

    # Unwrap a 0-d result so scalar input gives a scalar, as before.
    return density[()]


def plot_t_distributions():
    """Plot the t density on [-4, 4) for v = 1, 2, 5 and the v = inf limit."""
    # Imported lazily so that t_dist() can be used without matplotlib.
    import matplotlib.pyplot as plt

    x = np.arange(-4, 4, .1)
    for v in (1, 2, 5, math.inf):
        plt.plot(x, t_dist(x, v), label='v={}'.format(v))
    plt.legend()
    plt.show()


if __name__ == "__main__":
    plot_t_distributions()


# ---------------------------------------------------------------------------
# File: stats/t-test.py
# Two-sample Student's t-test computed by hand and cross-checked with scipy.
# ---------------------------------------------------------------------------
import numpy as np
from scipy import stats


def pooled_t_test(a, b):
    """Run a two-sided, equal-variance two-sample t-test.

    Args:
        a: First sample (1-D sequence or array of numbers).
        b: Second sample (1-D sequence or array of numbers).  The samples
            may have different sizes.

    Returns:
        A tuple ``(t, p)`` with the t statistic and the two-tailed p-value.

    Raises:
        ValueError: If either sample has fewer than two observations, since
            the sample variance is then undefined.
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    n_a, n_b = a.size, b.size
    if n_a < 2 or n_b < 2:
        raise ValueError("each sample needs at least two observations")

    # Degrees of freedom of the pooled estimate (2*N - 2 for equal sizes).
    df = n_a + n_b - 2

    # ddof=1 divides by N-1, giving the unbiased sample variance (the
    # maximum-likelihood estimate divides by N and is biased).
    var_a = a.var(ddof=1)
    var_b = b.var(ddof=1)

    # Pooled standard deviation; for n_a == n_b this reduces to
    # sqrt((var_a + var_b) / 2).
    s = np.sqrt(((n_a - 1) * var_a + (n_b - 1) * var_b) / df)

    # t statistic; for n_a == n_b == N the denominator is s * sqrt(2 / N).
    t = (a.mean() - b.mean()) / (s * np.sqrt(1 / n_a + 1 / n_b))

    # Two-tailed p-value.  abs(t) keeps it in [0, 1] when t is negative and
    # the survival function avoids the cancellation in 1 - cdf(t).
    p = 2 * stats.t.sf(abs(t), df=df)
    return t, p


def run_t_test_demo():
    """Draw two Gaussian samples, test them by hand and with scipy, and print both."""
    # Sample size
    N = 10
    # Gaussian distributed data with mean = 2 and var = 1
    a = np.random.randn(N) + 2
    # Gaussian distributed data with mean = 0 and var = 1
    b = np.random.randn(N)
    print(a)
    print(b)

    t, p = pooled_t_test(a, b)
    print("t = " + str(t))
    print("p = " + str(p))  # two-tailed
    # The samples are random and unseeded, so t and p differ on every run.
    # A small p-value is evidence against the null hypothesis of equal means
    # (it leads us to reject it at the chosen significance level); it does
    # not prove that the means differ.

    # Cross-check with the built-in scipy function
    t2, p2 = stats.ttest_ind(a, b)
    print("t = " + str(t2))
    print("p = " + str(p2))


if __name__ == "__main__":
    run_t_test_demo()
