feat: statistics chapter done
This commit is contained in:
@@ -3,11 +3,12 @@ import unittest
|
||||
|
||||
from modules.essential_math.examples.statistics_example import (
|
||||
normal_distribution_example,
|
||||
normal_distribution_exercise,
|
||||
t_distribution_example,
|
||||
basic_statistic_concepts_example,
|
||||
z_scores_example,
|
||||
final_exercises
|
||||
)
|
||||
|
||||
if __name__ == "__main__":
    # Toggle any of the chapter walkthroughs below by uncommenting it:
    # basic_statistic_concepts_example()
    # normal_distribution_example()
    # z_scores_example()
    final_exercises()
|
||||
|
||||
@@ -16,6 +16,7 @@ from modules.essential_math.statistics import (
|
||||
generic_critical_z_value,
|
||||
margin_of_error,
|
||||
confidence_interval,
|
||||
get_critical_value_range_t,
|
||||
)
|
||||
|
||||
def basic_statistic_concepts_example():
|
||||
@@ -59,6 +60,55 @@ def normal_distribution_example():
|
||||
expected_value = inverse_cumulative_density_function(target_probability, mean, std_dev);
|
||||
print(">> For a probability of .5 we expect the value: ", expected_value)
|
||||
|
||||
def normal_distribution_exercise():
    """Worked example: hypothesis testing on cold-recovery times.

    Recovery time is modelled as N(mean=18 days, std_dev=1.5). First computes
    interval probabilities, then runs one- and two-tailed significance tests on
    a treated group (40 people) whose sample mean dropped to 16 days.
    """
    population_mean = 18
    population_std = 1.5
    window_start, window_end = 15, 21

    # P(15 <= X <= 21) as a difference of CDF values.
    p_within = (
        normal_cumulative_density_function(window_end, population_mean, population_std)
        - normal_cumulative_density_function(window_start, population_mean, population_std)
    )
    print("Chances of recovering from a cold between 15 and 21 days: ", p_within)
    print("Chances of recovering before 15 days or after 21: ", 1.0 - p_within)
    # The normal distribution is symmetric, so each tail holds half the remainder.
    print("Chances of recovering before 15 days: ", (1.0 - p_within) / 2)

    # Drug trial: the treated sample mean is 16 days. Improvement or chance?
    treated_mean = 16
    significance_level = 0.05  # the conventional alpha

    ## One-tailed test: the inverse CDF gives the cutoff below which only 5% of
    ## untreated outcomes would fall.
    rejection_cutoff = inverse_cumulative_density_function(significance_level, population_mean, population_std)
    if rejection_cutoff < treated_mean:
        print("The rug (drug) did nothing.")
    else:
        print("The rug (drug) worked.")

    ## The same one-tailed test, expressed as a p-value.
    p_value = normal_cumulative_density_function(treated_mean, population_mean, population_std)
    print("The rug (drug) did nothing." if p_value > significance_level else "The rug (drug) worked.")

    ## Two-tailed test: alpha is split over both tails, which doubles the bar to
    ## clear and also detects the drug making recovery time WORSE.
    half_alpha = significance_level / 2
    lower_critical = inverse_cumulative_density_function(half_alpha, population_mean, population_std)
    upper_critical = inverse_cumulative_density_function(1.0 - half_alpha, population_mean, population_std)
    is_significant = treated_mean < lower_critical or treated_mean > upper_critical
    verdict = "The rug (drug) worked." if is_significant else "The rug (drug) did nothing."
    print(verdict, lower_critical, upper_critical)

    ## Two-tailed test with a p-value: add the symmetric tail mirrored across the mean.
    left_tail = normal_cumulative_density_function(treated_mean, population_mean, population_std)
    mirrored_mean = population_mean + (population_mean - treated_mean)
    right_tail = 1.0 - normal_cumulative_density_function(mirrored_mean, population_mean, population_std)
    two_tailed_p = left_tail + right_tail
    if two_tailed_p < significance_level:
        print("The rug (drug) worked.", two_tailed_p)
    else:
        print("The rug (drug) did nothing.", two_tailed_p)
    #### CONCEPT: P-hacking — trawling big data for anything that passes the
    #### p < 0.05 test and then claiming a relationship exists.
|
||||
|
||||
def z_scores_example():
|
||||
print("== Z-scores ==")
|
||||
|
||||
@@ -72,3 +122,47 @@ def central_limit_theorem_example():
|
||||
## Central limit theorem
|
||||
test_central_limit_theorem(sample_size=1, sample_count=1000)
|
||||
test_central_limit_theorem(sample_size=31, sample_count=1000)
|
||||
|
||||
def t_distribution_example():
    """Print the critical t-value range for 95% confidence on a small sample (n=25)."""
    (lower, upper) = get_critical_value_range_t(0.95, 25)
    print("The confidence interval is: ", lower, upper)
|
||||
|
||||
def final_exercises():
    """Chapter 3 final exercises (1-4): descriptive statistics, a normal-distribution
    probability, a confidence interval, and a two-tailed significance test."""
    # 1. Sample mean and sample standard deviation of measured pool widths.
    pool_widths = (1.78, 1.75, 1.72, 1.74, 1.77)
    pool_width_mean = mean(pool_widths)
    pool_width_std = standard_deviation(pool_widths, True)  # True -> sample (n-1) variant, presumably; confirm against helper
    print("1: ", pool_width_mean, pool_width_std)

    # 2. P(20 <= Z <= 30) for Z ~ N(mean=42, std_dev=8), via a CDF difference.
    z_mean = 42
    z_std_dev = 8
    z_prob_init = 20
    z_prob_end = 30
    z_prob_final = normal_cumulative_density_function(z_prob_end, z_mean, z_std_dev) - normal_cumulative_density_function(z_prob_init, z_mean, z_std_dev)
    print("2: ", z_prob_final)

    # 3. 99% confidence interval for filament diameter from a 34-item sample.
    #    (fixed misspelled locals: "conficence" -> "confidence")
    filament_value = 1.75
    filament_sample_size = 34
    filament_mean = 1.715588
    filament_std_dev = 0.029252
    filament_percentage_confidence = .99
    filament_z_value = z_score(filament_value, filament_mean, filament_std_dev)
    (filament_confidence_init, filament_confidence_end) = confidence_interval(filament_percentage_confidence, filament_sample_size, filament_std_dev, filament_z_value, filament_mean)
    print("3: ", filament_confidence_init, filament_confidence_end)

    # 4. Two-tailed test: did the campaign move daily sales off the historical mean?
    original_sales_average = 10345
    original_sales_std_dev = 552
    new_sales_average = 11641
    min_sales_percentage = 0.05  # alpha

    # One tail above the new average, doubled by symmetry for the two-tailed p-value.
    sales_p1 = 1.0 - normal_cumulative_density_function(new_sales_average, original_sales_average, original_sales_std_dev)
    sales_p = sales_p1 * 2  # take advantage of symmetry
    if (sales_p < min_sales_percentage):
        print("The sales campaign worked", sales_p)  # fixed typo: "campaing"
    else:
        print("The sales campaign did NOT work")
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
## Chapter 3 - Statistics
|
||||
|
||||
from math import sqrt, pi, e, exp
|
||||
from scipy.stats import norm
|
||||
from scipy.stats import norm, t
|
||||
|
||||
import random
|
||||
import plotly.express as px
|
||||
@@ -85,8 +85,8 @@ def test_central_limit_theorem(sample_size, sample_count):
|
||||
|
||||
def generic_critical_z_value(probability):
    """Return the (lower, upper) critical z-values bounding the central
    `probability` mass of the standard normal distribution.

    E.g. probability=0.95 -> approximately (-1.96, +1.96).
    """
    # Defect fixed: the block carried stale duplicate assignments referencing an
    # undefined name `p` (a NameError at runtime) alongside the corrected
    # `probability` lines; only the correct pair is kept.
    norm_dist = norm(loc=0.0, scale=1.0)
    # Split the excluded mass (1 - probability) evenly between the two tails.
    left_tail_area = (1.0 - probability) / 2.0
    upper_area = 1.0 - ((1.0 - probability) / 2.0)
    return norm_dist.ppf(left_tail_area), norm_dist.ppf(upper_area)
|
||||
|
||||
def margin_of_error(sample_size, standard_deviation, z_value):
|
||||
@@ -95,6 +95,14 @@ def margin_of_error(sample_size, standard_deviation, z_value):
|
||||
# How confident we are at a population metric given a sample (the interval we are "probability" sure the value will be)
|
||||
# How confident we are at a population metric given a sample (the interval we are "probability" sure the value will be)
def confidence_interval(probability, sample_size, standard_deviation, z_value, mean):
    """Return (upper, lower) bounds of the confidence interval around `mean`.

    Defects fixed: the block carried a stale self-referential line
    `margin_error = margin_error(...)` (an UnboundLocalError at runtime) next to
    the corrected `margin_of_error(...)` call; only the correct call is kept.
    A dead `critical_z = generic_critical_z_value(probability)` computation
    (its result was never used) is also removed — `probability` stays in the
    signature for caller compatibility.
    """
    margin_error = margin_of_error(sample_size, standard_deviation, z_value)
    # NOTE(review): returns (mean + margin, mean - margin), i.e. upper bound
    # first — callers unpack in that order; confirm before reordering.
    return mean + margin_error, mean - margin_error
|
||||
|
||||
## T Distribution
|
||||
## Similar to the normal distribution but made for smaller sample-sizes (30 or less)
|
||||
## When we get close to the 31 items, both are identical
|
||||
## T Distribution
## Similar to the normal distribution but made for smaller sample-sizes (30 or less)
## When we get close to the 31 items, both are identical
def get_critical_value_range_t(conficence_percentage: float, sample_size: int):
    """Return (lower, upper) critical t-values that bound the central
    `conficence_percentage` mass for a sample of `sample_size` items."""
    degrees_of_freedom = sample_size - 1
    # The excluded mass is split evenly between the two tails.
    tail_area = (1.0 - conficence_percentage) / 2
    lower = t.ppf(tail_area, df=degrees_of_freedom)
    upper = t.ppf(conficence_percentage + tail_area, df=degrees_of_freedom)
    return (lower, upper)
|
||||
|
||||
Reference in New Issue
Block a user