From 106205935e2dbd83ebba5c8977b873224a5a466e Mon Sep 17 00:00:00 2001 From: Daniel Heras Quesada Date: Thu, 19 Feb 2026 17:42:57 +0100 Subject: [PATCH] refactor: module organization --- src/main.py | 21 +-- .../examples/statistics_example.py | 74 ++++++++++ src/modules/{ => essential_math}/math.py | 0 .../{ => essential_math}/probability.py | 0 src/modules/essential_math/statistics.py | 100 +++++++++++++ src/modules/{ => primitives}/strings.py | 0 src/modules/statistics.py | 134 ------------------ 7 files changed, 181 insertions(+), 148 deletions(-) create mode 100644 src/modules/essential_math/examples/statistics_example.py rename src/modules/{ => essential_math}/math.py (100%) rename src/modules/{ => essential_math}/probability.py (100%) create mode 100644 src/modules/essential_math/statistics.py rename src/modules/{ => primitives}/strings.py (100%) delete mode 100644 src/modules/statistics.py diff --git a/src/main.py b/src/main.py index ce2d640..b1bb918 100644 --- a/src/main.py +++ b/src/main.py @@ -1,20 +1,13 @@ from sympy import diff, limit, oo, symbols import unittest -from modules.math import ( - test_math_module +from modules.essential_math.examples.statistics_example import ( + normal_distribution_example, + basic_statistic_concepts_example, + z_scores_example, ) -from modules.probability import ( - test_probability_module ) -from modules.statistics import ( - test_statistics_module ) -from modules.strings import t_strings if __name__=="__main__": - # t_strings() - # test_math_module() - # test_probability_module() - test_statistics_module() - # test_exercises_module() +    basic_statistic_concepts_example() +    normal_distribution_example() +    z_scores_example() diff --git a/src/modules/essential_math/examples/statistics_example.py b/src/modules/essential_math/examples/statistics_example.py new file mode 100644 index 0000000..e31432f --- /dev/null +++ b/src/modules/essential_math/examples/statistics_example.py @@ -0,0 +1,74 @@ +from 
modules.essential_math.statistics import ( + mean, + median, + weighted_mean, + weighted_mean_inline, + population_variance, + population_variance_inline, + sample_variance, + standard_deviation, + normal_probability_density_function, + normal_cumulative_density_function, + inverse_cumulative_density_function, + z_score, + coeficient_of_variation, + test_central_limit_theorem, + generic_critical_z_value, + margin_of_error, + confidence_interval, +) + +def basic_statistic_concepts_example(): + print("=== Statistics module ===") + + list = [ 1, 2, 3, 4, 5, 6] + print(">> The mean of {0} is {1}".format(list, mean(list))) + + weights = [0.2, 0.5, 0.7, 1, 0, 0.9] + print(">> The weighted_mean of {0} is {1} and it is equivalent to {2}".format(list, weighted_mean(list, weights), weighted_mean_inline(list, weights))) + + print(">> The median is {0}".format(median(list))) + + values = [ 0, 1, 5, 7, 9, 10, 14] + _population_variance = population_variance(values, sum(values) / len(values)) + population_variance_calc_inline = population_variance_inline(values); + print("The population variance is", _population_variance, population_variance_calc_inline) + + std_dev = standard_deviation(values, False) + print("The standard deviation is", std_dev) + + sample = values.copy() + del sample[3] + del sample[1] + print("The sample variance for a population is", sample_variance(sample)) + print("The standard deviation for a population is", standard_deviation(sample, True)) + +def normal_distribution_example(): + print("== Normal distribution ==") + + values = [ 0, 1, 5, 7, 9, 10, 14] + mean = sum(values) / len(values) + std_dev = standard_deviation(values, False) + target_x = 1 + + print(">> The probability_density_function for x = 1 over the example data is {0}".format(normal_probability_density_function(target_x, mean, std_dev))) + + print(">> The probability for observing a value smaller than 1 is given by the cumulative density function and it is: 
{0}".format(normal_cumulative_density_function(target_x, mean, std_dev))) + + target_probability = 0.5 + expected_value = inverse_cumulative_density_function(target_probability, mean, std_dev); + print(">> For a probability of .5 we expect the value: ", expected_value) + +def z_scores_example(): + print("== Z-scores ==") + + print("A house (A) of 150K in a neighborhood of 140K mean and 3K std_dev has a Z-score: {0}".format(z_score(150000, 140000, 3000))) + print("A house (B) of 815K in a neighborhood of 800K mean and 10K std_dev has a Z-score: {0}".format(z_score(815000, 800000, 10000))) + print("The House A is much more expensive because its z-score is higher.") + print("The neighborhood of B has a coeficient of variation: {0}, and the one of A: {1}".format(coeficient_of_variation(10000, 800000), coeficient_of_variation(3000, 140000))) + print("This means that the neighborhood of A has more spread in its prices") + +def central_limit_theorem_example(): + ## Central limit theorem + test_central_limit_theorem(sample_size=1, sample_count=1000) + test_central_limit_theorem(sample_size=31, sample_count=1000) diff --git a/src/modules/math.py b/src/modules/essential_math/math.py similarity index 100% rename from src/modules/math.py rename to src/modules/essential_math/math.py diff --git a/src/modules/probability.py b/src/modules/essential_math/probability.py similarity index 100% rename from src/modules/probability.py rename to src/modules/essential_math/probability.py diff --git a/src/modules/essential_math/statistics.py b/src/modules/essential_math/statistics.py new file mode 100644 index 0000000..a963f9c --- /dev/null +++ b/src/modules/essential_math/statistics.py @@ -0,0 +1,100 @@ +## This module represents the third chapter of the book +## "Essential Math for Data Science" - Thomas Nield +## Chapter 3 - Statistics + +from math import sqrt, pi, e, exp +from scipy.stats import norm + +import random +import plotly.express as px + +def mean(list): + return sum(list) / 
len(list) + +def weighted_mean(items, weights): + if (len(items) != len(weights)): + return + total = 0 + for i in range(len(items)): + total += items[i] * weights[i] + return total / sum(weights) + +def weighted_mean_inline(items, weights): + return sum(s * w for s, w in zip(items, weights)) / sum(weights) +# also called 50% quantile +def median(items): + ordered = sorted(items) + length = len(ordered) + pair = length % 2 == 0 + mid = int(length / 2) - 1 if pair else int(length / 2) + + if pair: + return (ordered[mid] + ordered[mid+1]) / 2 + else: + return ordered[mid] + +def mode(items): + sums = [] + +def population_variance(value_list, mean): + summatory = 0.0 + for value in value_list: + summatory += (value - mean) ** 2 + return summatory / len(value_list) + +def population_variance_inline(value_list): + return sum((v - (sum(value_list) / len(value_list))) ** 2 for v in value_list) / len(value_list) + +def sample_variance(value_list): + mean = sum(value_list) / len(value_list) + return sum((value - mean) ** 2 for value in value_list) / (len(value_list) - 1) + +def population_standard_deviation(value_list): + return sqrt(population_variance_inline(value_list)) + +def sample_standard_deviation(value_list): + return sqrt(sample_variance(value_list)) + +def standard_deviation(value_list, is_sample): + return sample_standard_deviation(value_list) if is_sample else population_standard_deviation(value_list) + +## Normal distribution +# PDF generates the Normal Distribution (symetric arround the mean) +def normal_probability_density_function(x: float, mean: float, standard_deviation: float): + return (1.0 / (2.0 * pi * standard_deviation ** 2) ** 0.5) * exp(-1.0 * ((x - mean) ** 2 / (2.0 * standard_deviation ** 2))) + +def normal_cumulative_density_function(x, mean, std_deviation): + return norm.cdf(x, mean, std_deviation) + +# Check exected value for a given probability +def inverse_cumulative_density_function(prob, mean, std_dev): + x = norm.ppf(prob, mean, std_dev) + 
return x + +# Z-scores are valuable in order to normalize 2 pieces of data +def z_score(value, data_mean, std_deviation): + return (value - data_mean) / std_deviation + +def coeficient_of_variation(std_deviation, mean): + return (std_deviation / mean) + +def test_central_limit_theorem(sample_size, sample_count): + x_values = [(sum([random.uniform(0.0,1.0) for i in range(sample_size)]) / sample_size) for _ in range(sample_count)] + y_values = [1 for _ in range(sample_count)] + px.histogram(x=x_values, y=y_values, nbins=20).show() + +def generic_critical_z_value(probability): + norm_dist = norm(loc=0.0, scale=1.0) + left_tail_area = (1.0 - probability) / 2.0 + upper_area = 1.0 - ((1.0 - probability) / 2.0) + return norm_dist.ppf(left_tail_area), norm_dist.ppf(upper_area) + +def margin_of_error(sample_size, standard_deviation, z_value): + return z_value * (standard_deviation / sqrt(sample_size)) # +-, we return the one provided by the z_value (tail or upper) + +# How confident we are at a population metric given a sample (the interval we are "probability" sure the value will be) +def confidence_interval(probability, sample_size, standard_deviation, z_value, mean): + critical_z = generic_critical_z_value(probability) + margin_error = margin_of_error(sample_size, standard_deviation, z_value) + return mean + margin_error, mean - margin_error + diff --git a/src/modules/strings.py b/src/modules/primitives/strings.py similarity index 100% rename from src/modules/strings.py rename to src/modules/primitives/strings.py diff --git a/src/modules/statistics.py b/src/modules/statistics.py deleted file mode 100644 index becb530..0000000 --- a/src/modules/statistics.py +++ /dev/null @@ -1,134 +0,0 @@ -## This module represents the third chapter of the book -## "Essential Math for Data Science" - Thomas Nield -## Chapter 3 - Statistics - -from math import sqrt, pi, e, exp -from scipy.stats import norm - -import random -import plotly.express as px - -def mean(list): - return sum(list) / len(list) - -def 
weighted_mean(items, weights): - if (len(items) != len(weights)): - return - total = 0 - for i in range(len(items)): - total += items[i] * weights[i] - return total / sum(weights) - -def weighted_mean_inline(items, weights): - return sum(s * w for s, w in zip(items, weights)) / sum(weights) - -# also called 50% quantile -def median(items): - ordered = sorted(items) - length = len(ordered) - pair = length % 2 == 0 - mid = int(length / 2) - 1 if pair else int(n/2) - - if pair: - return (ordered[mid] + ordered[mid+1]) / 2 - else: - return ordered[mid] - -def mode(items): - sums = [] - -def population_variance(difference_list, mean): - summatory = 0.0 - for diff in difference_list: - summatory += (diff - mean) ** 2 - return summatory / len(difference_list) - -def population_variance_inline(difference_list): - return sum((v - (sum(difference_list) / len(difference_list))) ** 2 for v in difference_list) / len(difference_list) - -def sample_variance(difference_list): - mean = sum(difference_list) / len(difference_list) - return sum((diff - mean) ** 2 for diff in difference_list) / (len(difference_list) - 1) - -def population_standard_deviation(difference_list): - return sqrt(population_variance_inline(difference_list)) - -def sample_standard_deviation(difference_list): - return sqrt(sample_variance(difference_list)) - -def standard_deviation(difference_list, is_sample): - return sample_standard_deviation(difference_list) if is_sample else population_standard_deviation(difference_list) - -## Normal distribution -# PDF generates the Normal Distribution (symetric arround the mean) -def normal_probability_density_function(x: float, mean: float, standard_deviation: float): - return (1.0 / (2.0 * pi * standard_deviation ** 2) ** 0.5) * exp(-1.0 * ((x - mean) ** 2 / (2.0 * standard_deviation ** 2))) - -def normal_cumulative_density_function(x, mean, difference_list): - std_dev = standard_deviation(difference_list, False) - return norm.cdf(x, mean, std_dev) - -# Check exected 
value for a given probability -def inverse_cumulative_density_function(prob, mean, std_dev): - x = norm.ppf(prob, mean, std_dev) - return x - -# Z-scores are valuable in order to normalize 2 pieces of data -def z_score(value, data_mean, std_deviation): - return (value - data_mean) / std_deviation - -def coeficient_of_variation(std_deviation, mean): - return (std_deviation / mean) - -def test_central_limit_theorem(sample_size, sample_count): - x_values = [(sum([random.uniform(0.0,1.0) for i in range(sample_size)]) / sample_size) for _ in range(sample_count)] - y_values = [1 for _ in range(sample_count)] - px.histogram(x=x_values, y=y_values, nbins=20).show() - -def generic_critical_z_value(probability): - norm_dist = norm(loc=0.0, scale=1.0) - left_tail_area = (1.0 - p) / 2.0 - upper_area = 1.0 - ((1.0 - p) / 2.0) - return norm_dist.ppf(left_tail_area), norm_dist.ppf(upper_area) - -def margin_of_error(sample_size, standard_deviation, z_value): - return z_value * (standard_deviation / sqrt(sample_size)) # +-, we return the one provided by the z_value (tail or upper) - -# How confident we are at a population metric given a sample (the interval we are "probability" sure the value will be) -def confidence_interval(probability, sample_size, standard_deviation, z_value, mean): - critical_z = generic_critical_z_value(probability) - margin_error = margin_error(sample_size, standard_deviation, z_value) - return mean + margin_error, mean - margin_error - - -def test_statistics_module(): - print("=== Statistics module ===") - list = [ 1, 2, 3, 4, 5, 6] - print(">> The mean of {0} is {1}".format(list, mean(list))) - weights = [0.2, 0.5, 0.7, 1, 0, 0.9] - print(">> The weighted_mean of {0} is {1} and it is equivalent to {2}".format(list, weighted_mean(list, weights), weighted_mean_inline(list, weights))) - print(">> The mean is {0}".format(median(list))) - - differences = [ -6.571, -5.571, -1.571, 0.429, 2.429, 3.429, 7.429 ] - print("The population variance is", 
population_variance(differences, sum(differences) / len(differences)), population_variance_inline(differences)) - print("The standard deviation is", standard_deviation(differences, False)) - sample = differences.copy() - del sample[3] - del sample[1] - print("The sample variance for a population is", sample_variance(sample)) - print("The standard deviation for a population is", standard_deviation(sample, True)) - - print("== Normal distribution ==") - print(">> The probability_density_function for x = 1 over the example data is {0}".format(normal_probability_density_function(1, sum(differences) / len(differences), standard_deviation(differences, False)))) - print(">> The probability for observing a value smaller than 1 is given by the cumulative density function and it is: {0}".format(normal_cumulative_density_function(1, sum(differences) / len(differences), differences))) - - print("== Z-scores ==") - print("A house (A) of 150K in a neighborhood of 140K mean and 3K std_dev has a Z-score: {0}".format(z_score(150000, 140000, 3000))) - print("A house (B) of 815K in a neighborhood of 800K mean and 10K std_dev has a Z-score: {0}".format(z_score(815000, 800000, 10000))) - print("The House A is much more expensive because its z-score is higher.") - print("The neighborhood of B has a coeficient of variation: {0}, and the one of A: {1}".format(coeficient_of_variation(3000, 140000), coeficient_of_variation(10000, 800000))) - print("This means that the neighborhood of A has more spread in its prices") - - ## Central limit theorem - test_central_limit_theorem(sample_size=1, sample_count=1000) - test_central_limit_theorem(sample_size=31, sample_count=1000)