feat: statistics chapter done

2026-02-22 16:45:11 +01:00
parent 106205935e
commit dd21fd19c6
3 changed files with 110 additions and 7 deletions
--- a/src/main.py
+++ b/src/main.py
@@ -3,11 +3,12 @@ import unittest
 from modules.essential_math.examples.statistics_example import (
 	normal_distribution_example,
 	normal_distribution_exercise,
 	t_distribution_example,
 	basic_statistic_concepts_example,
 	z_scores_example,
 	final_exercises
 )
 if __name__=="__main__":
-	# basic_statistic_concepts_example()
+	final_exercises()
 	# normal_distribution_example()
 	# z_scores_example()
--- a/src/modules/essential_math/examples/statistics_example.py
+++ b/src/modules/essential_math/examples/statistics_example.py
@@ -16,6 +16,7 @@ from modules.essential_math.statistics import (
 	generic_critical_z_value,
 	margin_of_error,
 	confidence_interval,
 	get_critical_value_range_t,
 )
 def basic_statistic_concepts_example():
@@ -59,6 +60,55 @@ def normal_distribution_example():
 	expected_value = inverse_cumulative_density_function(target_probability, mean, std_dev);
 	print(">> For a probability of .5 we expect the value: ", expected_value)
 def normal_distribution_exercise():
 	"""Walk through a cold-recovery example on a normal distribution, printing each result.

 	First prints the probability of recovering within a range of days, then runs
 	one-tailed and two-tailed significance tests (by critical value and by
 	p-value) for an observed mean recovery time of 16 days.
 	"""
 	# Population: cold recovery time with a mean of 18 days and a standard deviation of 1.5 days.
 	population_mean = 18
 	population_std = 1.5
 	# Probability of recovering between day 15 and day 21: CDF at the end minus CDF at the start.
 	cdf_at_end = normal_cumulative_density_function(21, population_mean, population_std)
 	cdf_at_start = normal_cumulative_density_function(15, population_mean, population_std)
 	inside_range = cdf_at_end - cdf_at_start
 	outside_range = 1.0 - inside_range
 	print("Chances of recovering from a cold between 15 and 21 days: ", inside_range)
 	print("Chances of recovering before 15 days or after 21: ", outside_range)
 	# The range is centred on the mean, so each tail holds half of the remaining probability.
 	print("Chances of recovering before 15 days: ", outside_range / 2)
 	# A drug given to 40 people shows a mean recovery time of 16 days.
 	# Test whether the drug improved the mean or the result is down to chance.
 	observed_mean = 16
 	significance = 0.05  # conventional threshold
 	## One-tailed test: the inverse CDF gives the value below which only `significance` of the area lies.
 	critical_days = inverse_cumulative_density_function(significance, population_mean, population_std)
 	verdict = "The rug (drug) did nothing." if critical_days < observed_mean else "The rug (drug) worked."
 	print(verdict)
 	## One-tailed test with a p-value: area to the left of the observed mean.
 	one_tailed_p = normal_cumulative_density_function(observed_mean, population_mean, population_std)
 	verdict = "The rug (drug) did nothing." if one_tailed_p > significance else "The rug (drug) worked."
 	print(verdict)
 	## Two-tailed test: the significance is split between both tails, so it is harder to pass
 	## and it also detects a drug that makes the recovery time worse.
 	half_significance = significance / 2
 	lower_critical = inverse_cumulative_density_function(half_significance, population_mean, population_std)
 	upper_critical = inverse_cumulative_density_function(1.0 - half_significance, population_mean, population_std)
 	is_extreme = observed_mean < lower_critical or observed_mean > upper_critical
 	verdict = "The rug (drug) worked." if is_extreme else "The rug (drug) did nothing."
 	print(verdict, lower_critical, upper_critical)
 	## Two-tailed test with a p-value: left tail plus the mirrored right tail.
 	left_tail = normal_cumulative_density_function(observed_mean, population_mean, population_std)
 	mirrored_mean = population_mean + (population_mean - observed_mean)
 	right_tail = 1.0 - normal_cumulative_density_function(mirrored_mean, population_mean, population_std)
 	two_tailed_p = left_tail + right_tail
 	verdict = "The rug (drug) worked." if two_tailed_p < significance else "The rug (drug) did nothing."
 	print(verdict, two_tailed_p)
 	#### CONCEPT: P-hacking = searching (typically in big-data scenarios) for data that passes the p_value < 0.05 test and then claiming a relationship.
 def z_scores_example():
 	print("== Z-scores ==")
@@ -72,3 +122,47 @@ def central_limit_theorem_example():
 	## Central limit theorem
 	test_central_limit_theorem(sample_size=1, sample_count=1000)
 	test_central_limit_theorem(sample_size=31, sample_count=1000)
 def t_distribution_example():
 	"""Print the critical t-value range for 95% confidence and a sample of 25 items."""
 	low_bound, high_bound = get_critical_value_range_t(0.95, 25)
 	print("The confidence interval is: ", low_bound, high_bound)
 def final_exercises():
 	"""Solve the four end-of-chapter statistics exercises and print each result.

 	1. Mean and standard deviation of five measurements.
 	2. Probability that a normally distributed value (mean 42, std dev 8) falls between 20 and 30.
 	3. 99% confidence interval for a mean, from a 34-item sample.
 	4. Two-tailed significance test of a new sales average against the historical one.
 	"""
 	# 1.
 	pool_widths = (1.78, 1.75, 1.72, 1.74, 1.77)
 	pool_width_mean = mean(pool_widths)
 	pool_width_std = standard_deviation(pool_widths, True)
 	print("1: ", pool_width_mean, pool_width_std)
 	# 2. Area between two values = CDF at the upper value minus CDF at the lower value.
 	z_mean = 42
 	z_std_dev = 8
 	z_prob_init = 20
 	z_prob_end = 30
 	z_prob_final = normal_cumulative_density_function(z_prob_end, z_mean, z_std_dev) - normal_cumulative_density_function(z_prob_init, z_mean, z_std_dev)
 	print("2: ", z_prob_final)
 	# 3. The question is whether the 1.75 reference value is plausible for the population mean.
 	filament_sample_size = 34
 	filament_mean = 1.715588
 	filament_std_dev = 0.029252
 	filament_percentage_conficence = .99
 	# The margin has to be scaled by the critical z of the requested confidence level
 	# (upper bound of the central 99% area), not by the z-score of the 1.75 value
 	# within the sample distribution, which is unrelated to the confidence level.
 	(_, filament_critical_z) = generic_critical_z_value(filament_percentage_conficence)
 	# sorted() guarantees (lower, upper) regardless of the order confidence_interval returns its bounds in.
 	(filament_confidence_init, filament_conficence_end) = sorted(
 		confidence_interval(filament_percentage_conficence, filament_sample_size, filament_std_dev, filament_critical_z, filament_mean)
 	)
 	print("3: ", filament_confidence_init, filament_conficence_end)
 	# 4. Two-tailed test: is the new sales average significantly different from the old one?
 	original_sales_average = 10345
 	original_sales_std_dev = 552
 	new_sales_average = 11641
 	min_sales_percentage = 0.05
 	# Right-tail area beyond the new average...
 	sales_p1 = 1.0 - normal_cumulative_density_function(new_sales_average, original_sales_average, original_sales_std_dev)
 	# ...doubled, because the mirrored left tail has the same area (symmetry of the normal distribution).
 	sales_p = sales_p1 * 2
 	if sales_p < min_sales_percentage:
 		print("The sales campaing worked", sales_p)
 	else:
 		print("The sales campaing did NOT work")
--- a/src/modules/essential_math/statistics.py
+++ b/src/modules/essential_math/statistics.py
@@ -3,7 +3,7 @@
 ##  Chapter 3 - Statistics 
 from math import sqrt, pi, e, exp
-from scipy.stats import norm
+from scipy.stats import norm, t
 import random
 import plotly.express as px
@@ -85,8 +85,8 @@ def test_central_limit_theorem(sample_size, sample_count):
 def generic_critical_z_value(probability):
 	# Returns the pair of z-values on the standard normal distribution (mean 0, std dev 1)
 	# that bound the central `probability` area; the remaining area is split evenly
 	# between the left and right tails.
 	norm_dist = norm(loc=0.0, scale=1.0)
-	left_tail_area = (1.0 - p) / 2.0
+	left_tail_area = (1.0 - probability) / 2.0
-	upper_area = 1.0 - ((1.0 - p) / 2.0)
+	upper_area = 1.0 - ((1.0 - probability) / 2.0)
 	# ppf is the inverse CDF: the z-value below which the given cumulative area lies.
 	return norm_dist.ppf(left_tail_area), norm_dist.ppf(upper_area)
 def margin_of_error(sample_size, standard_deviation, z_value):
@@ -95,6 +95,14 @@ def margin_of_error(sample_size, standard_deviation, z_value):
 # How confident we are about a population metric given a sample (the interval in which we are "probability" sure the value lies).
 # Returns the pair (mean + margin, mean - margin), i.e. the larger bound first when the margin is positive.
 # NOTE(review): critical_z is computed from `probability` but never used below; the margin depends only on
 # the caller-supplied z_value. Presumably the critical z was meant to drive the margin -- confirm.
 def confidence_interval(probability, sample_size, standard_deviation, z_value, mean):
 	critical_z = generic_critical_z_value(probability)
-	margin_error = margin_error(sample_size, standard_deviation, z_value)
+	margin_error = margin_of_error(sample_size, standard_deviation, z_value)
 	return mean + margin_error, mean - margin_error
 ## T Distribution
 ## Similar to the normal distribution, but intended for small sample sizes (30 items or fewer).
 ## As the sample size approaches about 31 items, the two distributions become nearly identical.
 def get_critical_value_range_t(conficence_percentage: float, sample_size: int):
 	"""Return the (lower, upper) critical t-values bounding the central confidence area.

 	The area outside the confidence level is split evenly between both tails, and
 	the t distribution is evaluated with sample_size - 1 degrees of freedom.
 	"""
 	degrees_of_freedom = sample_size - 1
 	tail_area = (1.0 - conficence_percentage) / 2
 	return (
 		t.ppf(tail_area, df=degrees_of_freedom),
 		t.ppf(conficence_percentage + tail_area, df=degrees_of_freedom),
 	)