feat: statistics chapter done
This commit is contained in:
@@ -3,11 +3,12 @@ import unittest
|
||||
|
||||
from modules.essential_math.examples.statistics_example import (
|
||||
normal_distribution_example,
|
||||
normal_distribution_exercise,
|
||||
t_distribution_example,
|
||||
basic_statistic_concepts_example,
|
||||
z_scores_example,
|
||||
final_exercises
|
||||
)
|
||||
|
||||
if __name__ == "__main__":
    # Toggle any of the chapter walkthroughs below by uncommenting it:
    # basic_statistic_concepts_example()
    # normal_distribution_example()
    # z_scores_example()
    final_exercises()
|
||||
|
||||
@@ -16,6 +16,7 @@ from modules.essential_math.statistics import (
|
||||
generic_critical_z_value,
|
||||
margin_of_error,
|
||||
confidence_interval,
|
||||
get_critical_value_range_t,
|
||||
)
|
||||
|
||||
def basic_statistic_concepts_example():
|
||||
@@ -59,6 +60,55 @@ def normal_distribution_example():
|
||||
expected_value = inverse_cumulative_density_function(target_probability, mean, std_dev);
|
||||
print(">> For a probability of .5 we expect the value: ", expected_value)
|
||||
|
||||
def normal_distribution_exercise():
    """Worked example: hypothesis testing on cold-recovery times.

    Recovery time is modelled as N(mean=18 days, std_dev=1.5). First computes
    interval probabilities, then runs one- and two-tailed significance tests on
    a treated group (40 people) whose sample mean dropped to 16 days.
    """
    population_mean = 18
    population_std = 1.5
    window_start, window_end = 15, 21

    # P(15 <= X <= 21) as a difference of CDF values.
    p_within = (
        normal_cumulative_density_function(window_end, population_mean, population_std)
        - normal_cumulative_density_function(window_start, population_mean, population_std)
    )
    print("Chances of recovering from a cold between 15 and 21 days: ", p_within)
    print("Chances of recovering before 15 days or after 21: ", 1.0 - p_within)
    # The normal distribution is symmetric, so each tail holds half the remainder.
    print("Chances of recovering before 15 days: ", (1.0 - p_within) / 2)

    # Drug trial: the treated sample mean is 16 days. Improvement or chance?
    treated_mean = 16
    significance_level = 0.05  # the conventional alpha

    ## One-tailed test: the inverse CDF gives the cutoff below which only 5% of
    ## untreated outcomes would fall.
    rejection_cutoff = inverse_cumulative_density_function(significance_level, population_mean, population_std)
    if rejection_cutoff < treated_mean:
        print("The rug (drug) did nothing.")
    else:
        print("The rug (drug) worked.")

    ## The same one-tailed test, expressed as a p-value.
    p_value = normal_cumulative_density_function(treated_mean, population_mean, population_std)
    print("The rug (drug) did nothing." if p_value > significance_level else "The rug (drug) worked.")

    ## Two-tailed test: alpha is split over both tails, which doubles the bar to
    ## clear and also detects the drug making recovery time WORSE.
    half_alpha = significance_level / 2
    lower_critical = inverse_cumulative_density_function(half_alpha, population_mean, population_std)
    upper_critical = inverse_cumulative_density_function(1.0 - half_alpha, population_mean, population_std)
    is_significant = treated_mean < lower_critical or treated_mean > upper_critical
    verdict = "The rug (drug) worked." if is_significant else "The rug (drug) did nothing."
    print(verdict, lower_critical, upper_critical)

    ## Two-tailed test with a p-value: add the symmetric tail mirrored across the mean.
    left_tail = normal_cumulative_density_function(treated_mean, population_mean, population_std)
    mirrored_mean = population_mean + (population_mean - treated_mean)
    right_tail = 1.0 - normal_cumulative_density_function(mirrored_mean, population_mean, population_std)
    two_tailed_p = left_tail + right_tail
    if two_tailed_p < significance_level:
        print("The rug (drug) worked.", two_tailed_p)
    else:
        print("The rug (drug) did nothing.", two_tailed_p)
    #### CONCEPT: P-hacking — trawling big data for anything that passes the
    #### p < 0.05 test and then claiming a relationship exists.
|
||||
|
||||
def z_scores_example():
|
||||
print("== Z-scores ==")
|
||||
|
||||
@@ -72,3 +122,47 @@ def central_limit_theorem_example():
|
||||
## Central limit theorem
|
||||
test_central_limit_theorem(sample_size=1, sample_count=1000)
|
||||
test_central_limit_theorem(sample_size=31, sample_count=1000)
|
||||
|
||||
def t_distribution_example():
    """Print the critical t-value range for 95% confidence on a small sample (n=25)."""
    (lower, upper) = get_critical_value_range_t(0.95, 25)
    print("The confidence interval is: ", lower, upper)
|
||||
|
||||
def final_exercises():
    """Chapter 3 final exercises (1-4): descriptive statistics, a normal-distribution
    probability, a confidence interval, and a two-tailed significance test."""
    # 1. Sample mean and sample standard deviation of measured pool widths.
    pool_widths = (1.78, 1.75, 1.72, 1.74, 1.77)
    pool_width_mean = mean(pool_widths)
    pool_width_std = standard_deviation(pool_widths, True)  # True -> sample (n-1) variant, presumably; confirm against helper
    print("1: ", pool_width_mean, pool_width_std)

    # 2. P(20 <= Z <= 30) for Z ~ N(mean=42, std_dev=8), via a CDF difference.
    z_mean = 42
    z_std_dev = 8
    z_prob_init = 20
    z_prob_end = 30
    z_prob_final = normal_cumulative_density_function(z_prob_end, z_mean, z_std_dev) - normal_cumulative_density_function(z_prob_init, z_mean, z_std_dev)
    print("2: ", z_prob_final)

    # 3. 99% confidence interval for filament diameter from a 34-item sample.
    #    (fixed misspelled locals: "conficence" -> "confidence")
    filament_value = 1.75
    filament_sample_size = 34
    filament_mean = 1.715588
    filament_std_dev = 0.029252
    filament_percentage_confidence = .99
    filament_z_value = z_score(filament_value, filament_mean, filament_std_dev)
    (filament_confidence_init, filament_confidence_end) = confidence_interval(filament_percentage_confidence, filament_sample_size, filament_std_dev, filament_z_value, filament_mean)
    print("3: ", filament_confidence_init, filament_confidence_end)

    # 4. Two-tailed test: did the campaign move daily sales off the historical mean?
    original_sales_average = 10345
    original_sales_std_dev = 552
    new_sales_average = 11641
    min_sales_percentage = 0.05  # alpha

    # One tail above the new average, doubled by symmetry for the two-tailed p-value.
    sales_p1 = 1.0 - normal_cumulative_density_function(new_sales_average, original_sales_average, original_sales_std_dev)
    sales_p = sales_p1 * 2  # take advantage of symmetry
    if (sales_p < min_sales_percentage):
        print("The sales campaign worked", sales_p)  # fixed typo: "campaing"
    else:
        print("The sales campaign did NOT work")
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
## Chapter 3 - Statistics
|
||||
|
||||
from math import sqrt, pi, e, exp
|
||||
from scipy.stats import norm
|
||||
from scipy.stats import norm, t
|
||||
|
||||
import random
|
||||
import plotly.express as px
|
||||
@@ -85,8 +85,8 @@ def test_central_limit_theorem(sample_size, sample_count):
|
||||
|
||||
def generic_critical_z_value(probability):
    """Return the (lower, upper) critical z-values bounding the central
    `probability` mass of the standard normal distribution.

    E.g. probability=0.95 -> approximately (-1.96, +1.96).
    """
    # Defect fixed: the block carried stale duplicate assignments referencing an
    # undefined name `p` (a NameError at runtime) alongside the corrected
    # `probability` lines; only the correct pair is kept.
    norm_dist = norm(loc=0.0, scale=1.0)
    # Split the excluded mass (1 - probability) evenly between the two tails.
    left_tail_area = (1.0 - probability) / 2.0
    upper_area = 1.0 - ((1.0 - probability) / 2.0)
    return norm_dist.ppf(left_tail_area), norm_dist.ppf(upper_area)
|
||||
|
||||
def margin_of_error(sample_size, standard_deviation, z_value):
|
||||
@@ -95,6 +95,14 @@ def margin_of_error(sample_size, standard_deviation, z_value):
|
||||
# How confident we are at a population metric given a sample (the interval we are "probability" sure the value will be)
|
||||
# How confident we are at a population metric given a sample (the interval we are "probability" sure the value will be)
def confidence_interval(probability, sample_size, standard_deviation, z_value, mean):
    """Return (upper, lower) bounds of the confidence interval around `mean`.

    Defects fixed: the block carried a stale self-referential line
    `margin_error = margin_error(...)` (an UnboundLocalError at runtime) next to
    the corrected `margin_of_error(...)` call; only the correct call is kept.
    A dead `critical_z = generic_critical_z_value(probability)` computation
    (its result was never used) is also removed — `probability` stays in the
    signature for caller compatibility.
    """
    margin_error = margin_of_error(sample_size, standard_deviation, z_value)
    # NOTE(review): returns (mean + margin, mean - margin), i.e. upper bound
    # first — callers unpack in that order; confirm before reordering.
    return mean + margin_error, mean - margin_error
|
||||
|
||||
## T Distribution
|
||||
## Similar to the normal distribution but made for smaller sample-sizes (30 or less)
|
||||
## When we get close to the 31 items, both are identical
|
||||
## T Distribution
## Similar to the normal distribution but made for smaller sample-sizes (30 or less)
## When we get close to the 31 items, both are identical
def get_critical_value_range_t(conficence_percentage: float, sample_size: int):
    """Return (lower, upper) critical t-values that bound the central
    `conficence_percentage` mass for a sample of `sample_size` items."""
    degrees_of_freedom = sample_size - 1
    # The excluded mass is split evenly between the two tails.
    tail_area = (1.0 - conficence_percentage) / 2
    lower = t.ppf(tail_area, df=degrees_of_freedom)
    upper = t.ppf(conficence_percentage + tail_area, df=degrees_of_freedom)
    return (lower, upper)
|
||||
|
||||
Reference in New Issue
Block a user