summaryrefslogtreecommitdiff
path: root/stats
diff options
context:
space:
mode:
Diffstat (limited to 'stats')
-rw-r--r-- stats/student_t_distribution.py | 19
-rw-r--r-- stats/t-test.py | 50
2 files changed, 69 insertions, 0 deletions
diff --git a/stats/student_t_distribution.py b/stats/student_t_distribution.py
new file mode 100644
index 0000000..ddea26c
--- /dev/null
+++ b/stats/student_t_distribution.py
@@ -0,0 +1,19 @@
+
+from scipy.special import gamma
+import math
+import matplotlib.pyplot as plt
+import numpy as np
+
+
+def t_dist(t, v):
+    """Evaluate the probability density of Student's t-distribution.
+
+    Args:
+        t: Point(s) at which to evaluate the density; a scalar or a NumPy
+            array.
+        v: Degrees of freedom, a positive scalar. ``math.inf`` selects the
+            limiting standard normal density.
+
+    Returns:
+        The density at ``t`` (a NumPy scalar or an array shaped like ``t``).
+    """
+    # Imported locally because the top of the file only imports ``gamma``.
+    from scipy.special import gammaln
+
+    if v == math.inf:
+        # As v -> inf the t-distribution converges to the standard normal.
+        return np.exp(-np.square(t) / 2) / math.sqrt(2 * math.pi)
+
+    # Work in log space: gamma((v+1)/2) overflows to inf once v exceeds
+    # roughly 340, which turned the direct gamma/gamma ratio into nan.
+    log_norm = (gammaln((v + 1) / 2) - gammaln(v / 2)
+                - 0.5 * math.log(v * math.pi))
+    return np.exp(log_norm - (v + 1) / 2 * np.log1p(np.square(t) / v))
+
+
+# Draw the density on [-4, 4) in steps of 0.1, one curve per value of v.
+x = np.arange(-4, 4, .1)
+for dof in (1, 2, 5):
+    plt.plot(x, t_dist(x, dof), label='v={}'.format(dof))
+# Disabled in the original; presumably because the gamma-based formula cannot
+# be evaluated at v = inf -- TODO confirm.
+# plt.plot(x, t_dist(x, math.inf), label='v=inf')
+plt.legend()
+plt.show()
+
diff --git a/stats/t-test.py b/stats/t-test.py
new file mode 100644
index 0000000..3ebba84
--- /dev/null
+++ b/stats/t-test.py
@@ -0,0 +1,50 @@
+## Import the packages
+import numpy as np
+from scipy import stats
+
+
+## Define 2 random distributions
+# Sample size (the same for both groups)
+N = 10
+# Gaussian distributed data with mean = 2 and var = 1
+a = np.random.randn(N) + 2
+# Gaussian distributed data with mean = 0 and var = 1
+b = np.random.randn(N)
+
+
+## Calculate the standard deviation
+# ddof=1 divides by N-1 (Bessel's correction), which gives the unbiased
+# sample variance; the maximum-likelihood estimate would divide by N instead.
+var_a = a.var(ddof=1)
+var_b = b.var(ddof=1)
+
+# Pooled standard deviation. Averaging the two variances is valid here
+# because both samples have the same size N.
+s = np.sqrt((var_a + var_b)/2)
+print(a)
+print(b)
+
+
+
+## Calculate the t-statistic
+# Two-sample t-statistic for equal sample sizes and a pooled variance.
+# 2.0/N keeps the division floating-point even under integer division.
+t = (a.mean() - b.mean())/(s*np.sqrt(2.0/N))
+
+
+
+## Compare with the critical t-value
+# Degrees of freedom
+df = 2*N - 2
+
+# One-tailed p-value: probability of a statistic at least as large as |t|.
+# abs() is required so that doubling it below is a valid two-tailed p-value
+# when t is negative (1 - cdf(t) would exceed 0.5 and 2*p would exceed 1).
+# sf is the survival function 1 - cdf.
+p = stats.t.sf(abs(t), df=df)
+
+
+print("t = " + str(t))
+print("p = " + str(2*p)) # two-tailed
+### A small two-tailed p-value (for example below 0.05) means we reject the
+### null hypothesis that the two distributions have the same mean. The exact
+### value changes from run to run because the samples are drawn without a
+### fixed random seed.
+
+
+## Cross-check with the built-in scipy function
+# ttest_ind is called with its defaults here and should reproduce the
+# hand-computed t and two-tailed p above.
+t2, p2 = stats.ttest_ind(a,b)
+print("t = " + str(t2))
+print("p = " + str(p2))