minor statistics definitions

2025-12-19 00:24:16 +01:00
parent 170697c955
commit 893250e7b5
1 changed files with 32 additions and 0 deletions
--- a/src/modules/statistics.py
+++ b/src/modules/statistics.py
@@ -1,6 +1,13 @@
 ## This module represents the third chapter of the book 
 ##  "Essential Math for Data Science" - Thomas Nield
 ##  Chapter 3 - Statistics 
 from math import sqrt, pi, e, exp
 from scipy.stats import norm
 import random
 import plotly.express as px
 def mean(list):
 	"""Return the arithmetic mean of the values in ``list``.
 	The sum of the values is divided by their count, so an empty
 	argument raises ``ZeroDivisionError``.
 	"""
 	total = sum(list)
 	count = len(list)
 	return total / count
@@ -73,6 +80,27 @@ def z_score(value, data_mean, std_deviation):
 def coeficient_of_variation(std_deviation, mean):
 	"""Return the ratio of ``std_deviation`` to ``mean``.
 	With built-in numbers, a ``mean`` of zero raises ``ZeroDivisionError``.
 	"""
 	ratio = std_deviation / mean
 	return ratio
 def test_central_limit_theorem(sample_size, sample_count):
 	"""Show a histogram of the means of uniform random samples.
 	Builds ``sample_count`` values, each the average of ``sample_size``
 	draws from ``random.uniform(0.0, 1.0)``, and displays them in a
 	plotly histogram with ``nbins=20``, passing a constant 1 as the y
 	value for every sample.
 	"""
 	sample_means = []
 	for _ in range(sample_count):
 		draws = [random.uniform(0.0, 1.0) for _ in range(sample_size)]
 		sample_means.append(sum(draws) / sample_size)
 	# One y entry per sample mean, all equal to 1.
 	unit_values = [1 for _ in range(sample_count)]
 	figure = px.histogram(x=sample_means, y=unit_values, nbins=20)
 	figure.show()
 def generic_critical_z_value(probability):
 	"""Return the lower and upper critical z-values for ``probability``.
 	The area outside the central ``probability`` of a normal distribution
 	with ``loc=0.0`` and ``scale=1.0`` is split evenly between both tails,
 	and the z-values bounding the central area are looked up with ``ppf``.
 	Args:
 		probability: Central area to enclose, e.g. ``0.95``.
 	Returns:
 		A ``(lower, upper)`` tuple of critical z-values.
 	"""
 	norm_dist = norm(loc=0.0, scale=1.0)
 	# Area left in each tail once the central ``probability`` is removed.
 	left_tail_area = (1.0 - probability) / 2.0
 	# Cumulative area up to the start of the right tail.
 	upper_area = 1.0 - left_tail_area
 	return norm_dist.ppf(left_tail_area), norm_dist.ppf(upper_area)
 def margin_of_error(sample_size, standard_deviation, z_value):
 	"""Return ``z_value`` times the standard error of the mean.
 	The standard error is ``standard_deviation / sqrt(sample_size)``.
 	The result keeps the sign of ``z_value``, so the caller gets the
 	lower-tail or upper margin depending on the z-value passed in.
 	"""
 	standard_error = standard_deviation / sqrt(sample_size)
 	return z_value * standard_error
 # Interval around a sample mean in which the population metric is expected
 # to lie, for the confidence level that the supplied z_value corresponds to.
 def confidence_interval(probability, sample_size, standard_deviation, z_value, mean):
 	"""Return the bounds ``(mean + margin, mean - margin)`` of the interval.
 	Args:
 		probability: Confidence level. Kept for interface compatibility;
 			the computation relies on ``z_value`` only.
 		sample_size: Number of observations in the sample.
 		standard_deviation: Standard deviation used for the standard error.
 		z_value: Critical z-value; presumably the one matching
 			``probability`` (verify against callers).
 		mean: Sample mean at the centre of the interval.
 	Returns:
 		A tuple ``(mean + margin, mean - margin)`` where ``margin`` is the
 		value returned by ``margin_of_error``.
 	"""
 	# The local is named ``margin`` so it cannot be confused with the helper.
 	margin = margin_of_error(sample_size, standard_deviation, z_value)
 	return mean + margin, mean - margin
 def test_statistics_module():
 	print("=== Statistics module ===")
 	list = [ 1, 2, 3, 4, 5, 6]
@@ -100,3 +128,7 @@ def test_statistics_module():
 	print("The House A is much more expensive because its z-score is higher.")
 	print("The neighborhood of B has a coeficient of variation: {0}, and the one of A: {1}".format(coeficient_of_variation(3000, 140000), coeficient_of_variation(10000, 800000)))
 	print("This means that the neighborhood of A has more spread in its prices")
 	## Central limit theorem
 	test_central_limit_theorem(sample_size=1, sample_count=1000)
 	test_central_limit_theorem(sample_size=31, sample_count=1000)