From dd21fd19c68be2b656d79d780b5c86d887be33ab Mon Sep 17 00:00:00 2001 From: Daniel Heras Quesada Date: Sun, 22 Feb 2026 16:45:11 +0100 Subject: [PATCH] feat: statistics chapter done --- src/main.py | 7 +- .../examples/statistics_example.py | 94 +++++++++++++++++++ src/modules/essential_math/statistics.py | 16 +++- 3 files changed, 110 insertions(+), 7 deletions(-) diff --git a/src/main.py b/src/main.py index b1bb918..d0d7d39 100644 --- a/src/main.py +++ b/src/main.py @@ -3,11 +3,12 @@ import unittest from modules.essential_math.examples.statistics_example import ( normal_distribution_example, + normal_distribution_exercise, + t_distribution_example, basic_statistic_concepts_example, z_scores_example, + final_exercises ) if __name__=="__main__": - # basic_statistic_concepts_example() - # normal_distribution_example() - # z_scores_example() + final_exercises() diff --git a/src/modules/essential_math/examples/statistics_example.py b/src/modules/essential_math/examples/statistics_example.py index e31432f..1f0ef06 100644 --- a/src/modules/essential_math/examples/statistics_example.py +++ b/src/modules/essential_math/examples/statistics_example.py @@ -16,6 +16,7 @@ from modules.essential_math.statistics import ( generic_critical_z_value, margin_of_error, confidence_interval, + get_critical_value_range_t, ) def basic_statistic_concepts_example(): @@ -59,6 +60,55 @@ def normal_distribution_example(): expected_value = inverse_cumulative_density_function(target_probability, mean, std_dev); print(">> For a probability of .5 we expect the value: ", expected_value) +def normal_distribution_exercise(): + # Population with cold MEAN recovery time of 18 days, with std_dev of 1.5 days. 
+ # Chances of recovery between 15 and 21 days + mean = 18 + std_dev = 1.5 + init = 15 + end = 21 + chances = normal_cumulative_density_function(end, mean, std_dev) - normal_cumulative_density_function(init, mean, std_dev) + print("Chances of recovering from a cold between 15 and 21 days: ", chances) + print("Chances of recovering before 15 days or after 21: ", 1.0 - chances) + # since it's a normal distribution, the chances are equally distributed + print("Chances of recovering before 15 days: ", (1.0 - chances) / 2) + + # Apply rug (or drug) to 40 people and see a 16 MEAN recovery time. Test if the rug improved the mean or it was just chance + ## One tailed test: use inverse cdf in order to find the limit value for a given %. + new_mean = 16 + min_target_percentage = 0.05 # this is a standard + min_mean = inverse_cumulative_density_function(min_target_percentage, mean, std_dev) + if (min_mean < new_mean): + print("The rug (drug) did nothing.") + else: + print("The rug (drug) worked.") + ## One tailed test with a P value + p_value = normal_cumulative_density_function(new_mean, mean, std_dev) + if (p_value > min_target_percentage): + print("The rug (drug) did nothing.") + else: + print("The rug (drug) worked.") + + ## Two tailed test (look for both sides of the normal distribution) + ## Double the checks, harder to prove (x2) and checks if the rug(drug) makes the recovery time worse. 
+ left_min_target = min_target_percentage / 2 + x1 = inverse_cumulative_density_function(left_min_target, mean, std_dev) + x2 = inverse_cumulative_density_function(1.0 - left_min_target, mean, std_dev) + if (new_mean < x1 or new_mean > x2): + print("The rug (drug) worked.", x1, x2) + else: + print("The rug (drug) did nothing.", x1, x2) + ## Two tailed test with a P value + p1 = normal_cumulative_density_function(new_mean, mean, std_dev) + right_symetrical_mean = mean + (mean - new_mean) + p2 = 1.0 - normal_cumulative_density_function(right_symetrical_mean, mean, std_dev) + p_value = p1 + p2 + if (p_value < min_target_percentage): + print("The rug (drug) worked.", p_value) + else: + print("The rug (drug) did nothing.", p_value) + #### CONCEPT: P-hacking, searching for data (in big data scenarios) that passes the p_value < 0.05 test and claiming for a relation. + def z_scores_example(): print("== Z-scores ==") @@ -72,3 +122,47 @@ def central_limit_theorem_example(): ## Central limit theorem test_central_limit_theorem(sample_size=1, sample_count=1000) test_central_limit_theorem(sample_size=31, sample_count=1000) + +def t_distribution_example(): + confidence = 0.95 + sample_size = 25 + (lower, upper) = get_critical_value_range_t(confidence, sample_size) + print("The confidence interval is: ", lower, upper) + +def final_exercises(): + # 1. + pool_widths = (1.78, 1.75, 1.72, 1.74, 1.77) + pool_width_mean = mean(pool_widths) + pool_width_std = standard_deviation(pool_widths, True) + print("1: ", pool_width_mean, pool_width_std) + + # 2. + z_mean = 42 + z_std_dev = 8 + z_prob_init = 20 + z_prob_end = 30 + z_prob_final = normal_cumulative_density_function(z_prob_end,z_mean,z_std_dev) - normal_cumulative_density_function(z_prob_init, z_mean, z_std_dev) + print("2: ", z_prob_final) + + # 3. 
+ filament_value = 1.75 + filament_sample_size = 34 + filament_mean = 1.715588 + filament_std_dev = 0.029252 + filament_percentage_conficence = .99 + filament_z_value = z_score(filament_value,filament_mean, filament_std_dev) + (filament_confidence_init, filament_conficence_end) = confidence_interval(filament_percentage_conficence, filament_sample_size, filament_std_dev, filament_z_value, filament_mean) + print("3: ", filament_confidence_init, filament_conficence_end) + + # 4. + original_sales_average = 10345 + original_sales_std_dev = 552 + new_sales_average = 11641 + min_sales_percentage = 0.05 + + sales_p1 = 1.0 - normal_cumulative_density_function(new_sales_average, original_sales_average, original_sales_std_dev) + sales_p = sales_p1 * 2 # take advantage of symmetry + if (sales_p < min_sales_percentage): + print("The sales campaing worked", sales_p) + else: + print("The sales campaing did NOT work") diff --git a/src/modules/essential_math/statistics.py b/src/modules/essential_math/statistics.py index a963f9c..77ae442 100644 --- a/src/modules/essential_math/statistics.py +++ b/src/modules/essential_math/statistics.py @@ -3,7 +3,7 @@ ## Chapter 3 - Statistics from math import sqrt, pi, e, exp -from scipy.stats import norm +from scipy.stats import norm, t import random import plotly.express as px @@ -85,8 +85,8 @@ def test_central_limit_theorem(sample_size, sample_count): def generic_critical_z_value(probability): norm_dist = norm(loc=0.0, scale=1.0) - left_tail_area = (1.0 - p) / 2.0 - upper_area = 1.0 - ((1.0 - p) / 2.0) + left_tail_area = (1.0 - probability) / 2.0 + upper_area = 1.0 - ((1.0 - probability) / 2.0) return norm_dist.ppf(left_tail_area), norm_dist.ppf(upper_area) def margin_of_error(sample_size, standard_deviation, z_value): @@ -95,6 +95,14 @@ def margin_of_error(sample_size, standard_deviation, z_value): # How confident we are at a population metric given a sample (the interval we are "probability" sure the value will be) def 
confidence_interval(probability, sample_size, standard_deviation, z_value, mean): critical_z = generic_critical_z_value(probability) - margin_error = margin_error(sample_size, standard_deviation, z_value) + margin_error = margin_of_error(sample_size, standard_deviation, z_value) return mean + margin_error, mean - margin_error +## T Distribution +## Similar to the normal distribution but made for smaller sample-sizes (30 or less) +## When we get close to the 31 items, both are identical +def get_critical_value_range_t(conficence_percentage: float, sample_size: int): + untrusted_percentage = 1.0 - conficence_percentage + lower = t.ppf(untrusted_percentage / 2, df=sample_size-1) + upper = t.ppf(conficence_percentage + (untrusted_percentage / 2), df=sample_size-1) + return (lower, upper)