feat: statistics chapter done
This commit is contained in:
@@ -3,11 +3,12 @@ import unittest
|
|||||||
|
|
||||||
from modules.essential_math.examples.statistics_example import (
|
from modules.essential_math.examples.statistics_example import (
|
||||||
normal_distribution_example,
|
normal_distribution_example,
|
||||||
|
normal_distribution_exercise,
|
||||||
|
t_distribution_example,
|
||||||
basic_statistic_concepts_example,
|
basic_statistic_concepts_example,
|
||||||
z_scores_example,
|
z_scores_example,
|
||||||
|
final_exercises
|
||||||
)
|
)
|
||||||
|
|
||||||
if __name__=="__main__":
|
if __name__=="__main__":
|
||||||
# basic_statistic_concepts_example()
|
final_exercises()
|
||||||
# normal_distribution_example()
|
|
||||||
# z_scores_example()
|
|
||||||
|
|||||||
@@ -16,6 +16,7 @@ from modules.essential_math.statistics import (
|
|||||||
generic_critical_z_value,
|
generic_critical_z_value,
|
||||||
margin_of_error,
|
margin_of_error,
|
||||||
confidence_interval,
|
confidence_interval,
|
||||||
|
get_critical_value_range_t,
|
||||||
)
|
)
|
||||||
|
|
||||||
def basic_statistic_concepts_example():
|
def basic_statistic_concepts_example():
|
||||||
@@ -59,6 +60,55 @@ def normal_distribution_example():
|
|||||||
expected_value = inverse_cumulative_density_function(target_probability, mean, std_dev);
|
expected_value = inverse_cumulative_density_function(target_probability, mean, std_dev);
|
||||||
print(">> For a probability of .5 we expect the value: ", expected_value)
|
print(">> For a probability of .5 we expect the value: ", expected_value)
|
||||||
|
|
||||||
|
def normal_distribution_exercise():
    """Walk through the cold-recovery example: range probability and hypothesis tests.

    The population recovers from a cold in 18 days on average, with a
    standard deviation of 1.5 days. The first part prints the probability of
    recovering between 15 and 21 days. The second part takes a group of 40
    people who received a drug and recovered in 16 days on average, and
    checks whether that result is significant at the standard 5% level using
    four approaches: one-tailed (critical value, then p-value) and two-tailed
    (critical range, then p-value). Every result is printed; nothing is
    returned.
    """
    population_mean = 18
    population_std = 1.5

    # --- Probability of recovering inside the 15..21 day window -----------
    window_start = 15
    window_end = 21
    area_below_end = normal_cumulative_density_function(window_end, population_mean, population_std)
    area_below_start = normal_cumulative_density_function(window_start, population_mean, population_std)
    inside_window = area_below_end - area_below_start
    print("Chances of recovering from a cold between 15 and 21 days: ", inside_window)
    print("Chances of recovering before 15 days or after 21: ", 1.0 - inside_window)
    # The window is centred on the mean, so the leftover area splits evenly
    # between the two tails.
    print("Chances of recovering before 15 days: ", (1.0 - inside_window) / 2)

    # --- Did the drug shorten recovery, or was it chance? -----------------
    treated_mean = 16
    significance = 0.05  # conventional threshold

    # One-tailed test via the critical value: the inverse CDF gives the
    # recovery time below which only 5% of the population falls.
    critical_mean = inverse_cumulative_density_function(significance, population_mean, population_std)
    if critical_mean < treated_mean:
        one_tailed_verdict = "The rug (drug) did nothing."
    else:
        one_tailed_verdict = "The rug (drug) worked."
    print(one_tailed_verdict)

    # One-tailed test via the p-value: the area to the left of the observed mean.
    one_tailed_p = normal_cumulative_density_function(treated_mean, population_mean, population_std)
    if one_tailed_p > significance:
        one_tailed_p_verdict = "The rug (drug) did nothing."
    else:
        one_tailed_p_verdict = "The rug (drug) worked."
    print(one_tailed_p_verdict)

    # Two-tailed test via the critical range: the 5% is split across both
    # tails, which is harder to pass and also detects a drug that makes
    # recovery slower.
    tail_area = significance / 2
    lower_critical = inverse_cumulative_density_function(tail_area, population_mean, population_std)
    upper_critical = inverse_cumulative_density_function(1.0 - tail_area, population_mean, population_std)
    outside_range = treated_mean < lower_critical or treated_mean > upper_critical
    if outside_range:
        print("The rug (drug) worked.", lower_critical, upper_critical)
    else:
        print("The rug (drug) did nothing.", lower_critical, upper_critical)

    # Two-tailed test via the p-value: add the left-tail area to the area of
    # the mirrored right tail.
    left_tail_p = normal_cumulative_density_function(treated_mean, population_mean, population_std)
    mirrored_mean = population_mean + (population_mean - treated_mean)
    right_tail_p = 1.0 - normal_cumulative_density_function(mirrored_mean, population_mean, population_std)
    two_tailed_p = left_tail_p + right_tail_p
    if two_tailed_p < significance:
        print("The rug (drug) worked.", two_tailed_p)
    else:
        print("The rug (drug) did nothing.", two_tailed_p)
    # CONCEPT: p-hacking means searching (typically in big-data scenarios) for
    # data that passes the p_value < 0.05 test and then claiming a relation.
|
||||||
|
|
||||||
def z_scores_example():
|
def z_scores_example():
|
||||||
print("== Z-scores ==")
|
print("== Z-scores ==")
|
||||||
|
|
||||||
@@ -72,3 +122,47 @@ def central_limit_theorem_example():
|
|||||||
## Central limit theorem
|
## Central limit theorem
|
||||||
test_central_limit_theorem(sample_size=1, sample_count=1000)
|
test_central_limit_theorem(sample_size=1, sample_count=1000)
|
||||||
test_central_limit_theorem(sample_size=31, sample_count=1000)
|
test_central_limit_theorem(sample_size=31, sample_count=1000)
|
||||||
|
|
||||||
|
def t_distribution_example():
    """Print the critical t-value range for a 95% level and a sample of 25.

    The two values come straight from ``get_critical_value_range_t`` and are
    printed with the label "The confidence interval is: ".
    """
    critical_range = get_critical_value_range_t(0.95, 25)
    low, high = critical_range
    print("The confidence interval is: ", low, high)
|
||||||
|
|
||||||
|
def final_exercises():
    """Solve the four end-of-chapter statistics exercises and print each answer.

    1. Mean and standard deviation of five measured widths.
    2. Probability that a normally distributed value (mean 42, standard
       deviation 8) falls between 20 and 30.
    3. 99% confidence interval for a mean, from a sample of 34 with sample
       mean 1.715588 and standard deviation 0.029252.
    4. Two-tailed significance test of a new sales average (11641) against
       the original distribution (mean 10345, standard deviation 552).

    Nothing is returned; results are written to stdout.
    """
    # 1.
    pool_widths = (1.78, 1.75, 1.72, 1.74, 1.77)
    pool_width_mean = mean(pool_widths)
    # NOTE(review): the True flag presumably selects the sample (n - 1)
    # formula -- confirm against standard_deviation's signature.
    pool_width_std = standard_deviation(pool_widths, True)
    print("1: ", pool_width_mean, pool_width_std)

    # 2.
    z_mean = 42
    z_std_dev = 8
    z_prob_init = 20
    z_prob_end = 30
    z_prob_final = (
        normal_cumulative_density_function(z_prob_end, z_mean, z_std_dev)
        - normal_cumulative_density_function(z_prob_init, z_mean, z_std_dev)
    )
    print("2: ", z_prob_final)

    # 3. The advertised filament value is 1.75; the question is which
    # interval contains the true mean with 99% confidence.
    filament_sample_size = 34
    filament_mean = 1.715588
    filament_std_dev = 0.029252
    filament_percentage_confidence = .99
    # The multiplier for a confidence interval is the critical z of the
    # confidence level, not the z-score of the advertised value.
    # NOTE(review): assumes margin_of_error scales z_value by
    # std / sqrt(sample_size) -- confirm.
    _, filament_critical_z = generic_critical_z_value(filament_percentage_confidence)
    # confidence_interval returns (mean + margin, mean - margin); sort so the
    # printed pair always reads lower bound first.
    filament_confidence_init, filament_confidence_end = sorted(
        confidence_interval(
            filament_percentage_confidence,
            filament_sample_size,
            filament_std_dev,
            filament_critical_z,
            filament_mean,
        )
    )
    print("3: ", filament_confidence_init, filament_confidence_end)

    # 4.
    original_sales_average = 10345
    original_sales_std_dev = 552
    new_sales_average = 11641
    min_sales_percentage = 0.05

    # Right-tail area beyond the new average ...
    sales_p1 = 1.0 - normal_cumulative_density_function(
        new_sales_average, original_sales_average, original_sales_std_dev
    )
    # ... doubled, because the normal curve is symmetric, to get the
    # two-tailed p-value.
    sales_p = sales_p1 * 2
    if sales_p < min_sales_percentage:
        print("The sales campaing worked", sales_p)
    else:
        print("The sales campaing did NOT work")
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
## Chapter 3 - Statistics
|
## Chapter 3 - Statistics
|
||||||
|
|
||||||
from math import sqrt, pi, e, exp
|
from math import sqrt, pi, e, exp
|
||||||
from scipy.stats import norm
|
from scipy.stats import norm, t
|
||||||
|
|
||||||
import random
|
import random
|
||||||
import plotly.express as px
|
import plotly.express as px
|
||||||
@@ -85,8 +85,8 @@ def test_central_limit_theorem(sample_size, sample_count):
|
|||||||
|
|
||||||
def generic_critical_z_value(probability):
    """Return the pair of z-values that enclose a central probability.

    Works on the standard normal distribution (mean 0, standard deviation 1):
    the area between the two returned z-values equals ``probability``, and
    the remaining area is split evenly between the two tails.

    Args:
        probability: Central area to enclose, as a fraction (e.g. ``0.95``).

    Returns:
        A ``(lower, upper)`` tuple of z-values.
    """
    norm_dist = norm(loc=0.0, scale=1.0)
    # Use the `probability` parameter throughout; the name `p` is not defined
    # in this function.
    left_tail_area = (1.0 - probability) / 2.0
    upper_area = 1.0 - left_tail_area
    return norm_dist.ppf(left_tail_area), norm_dist.ppf(upper_area)
|
||||||
|
|
||||||
def margin_of_error(sample_size, standard_deviation, z_value):
|
def margin_of_error(sample_size, standard_deviation, z_value):
|
||||||
@@ -95,6 +95,14 @@ def margin_of_error(sample_size, standard_deviation, z_value):
|
|||||||
# How confident we are at a population metric given a sample (the interval we are "probability" sure the value will be)
|
# How confident we are at a population metric given a sample (the interval we are "probability" sure the value will be)
|
||||||
def confidence_interval(probability, sample_size, standard_deviation, z_value, mean):
    """Return the two bounds of an interval centred on ``mean``.

    The half-width is whatever ``margin_of_error`` returns for the given
    sample size, standard deviation and z-value.

    Args:
        probability: Confidence level. NOTE(review): not used by the
            computation -- the caller-supplied ``z_value`` decides the width;
            confirm whether the critical z for ``probability`` was intended.
        sample_size: Number of observations in the sample.
        standard_deviation: Standard deviation passed to ``margin_of_error``.
        z_value: z multiplier passed to ``margin_of_error``.
        mean: Centre of the interval.

    Returns:
        A ``(mean + margin, mean - margin)`` tuple, in that order.
    """
    # Use a local name distinct from the helper: assigning to `margin_error`
    # here would shadow a same-named callee and raise UnboundLocalError.
    margin = margin_of_error(sample_size, standard_deviation, z_value)
    return mean + margin, mean - margin
|
||||||
|
|
||||||
|
## T Distribution
|
||||||
|
## Similar to the normal distribution but made for smaller sample-sizes (30 or less)
|
||||||
|
## Once the sample reaches about 31 items, the two distributions are nearly identical
|
||||||
|
def get_critical_value_range_t(conficence_percentage: float, sample_size: int):
    """Return the critical t-values that enclose a central confidence level.

    Uses a t-distribution with ``sample_size - 1`` degrees of freedom and
    splits the area outside the confidence level evenly between both tails.

    Args:
        conficence_percentage: Central probability to enclose, as a fraction
            in ``[0, 1]`` (e.g. ``0.95``). The spelling is kept so existing
            keyword-argument callers keep working.
        sample_size: Number of observations; must be at least 2 so there is
            at least one degree of freedom.

    Returns:
        A ``(lower, upper)`` tuple of critical t-values.

    Raises:
        ValueError: If the confidence is outside ``[0, 1]`` or the sample
            has fewer than 2 observations.
    """
    # Fail loudly instead of letting scipy return NaN for unusable inputs.
    if not 0.0 <= conficence_percentage <= 1.0:
        raise ValueError(
            f"confidence must be a fraction between 0 and 1, got {conficence_percentage!r}"
        )
    if sample_size < 2:
        raise ValueError(
            f"sample_size must be at least 2, got {sample_size!r}"
        )

    degrees_of_freedom = sample_size - 1
    untrusted_percentage = 1.0 - conficence_percentage
    lower = t.ppf(untrusted_percentage / 2, df=degrees_of_freedom)
    upper = t.ppf(conficence_percentage + (untrusted_percentage / 2), df=degrees_of_freedom)
    return (lower, upper)
|
||||||
|
|||||||
Reference in New Issue
Block a user