minor statistics definitions

This commit is contained in:
2025-12-19 00:24:16 +01:00
parent 170697c955
commit 893250e7b5

View File

@@ -1,6 +1,13 @@
## This module represents the third chapter of the book
## "Essential Math for Data Science" - Thomas Nield
## Chapter 3 - Statistics
from math import sqrt, pi, e, exp from math import sqrt, pi, e, exp
from scipy.stats import norm from scipy.stats import norm
import random
import plotly.express as px
def mean(list):
    """Return the arithmetic mean of the values in ``list``.

    The parameter keeps its original name (which shadows the builtin
    ``list`` inside this function) so keyword callers keep working.

    Raises:
        ZeroDivisionError: if ``list`` is empty.
    """
    return sum(list) / len(list)
@@ -73,6 +80,27 @@ def z_score(value, data_mean, std_deviation):
def coeficient_of_variation(std_deviation, mean):
    """Return the coefficient of variation: ``std_deviation / mean``.

    The result is a unitless ratio, which makes the spread of data sets
    with different scales comparable.

    Raises:
        ZeroDivisionError: if ``mean`` is zero.
    """
    return std_deviation / mean
def test_central_limit_theorem(sample_size, sample_count):
    """Plot a histogram of sample means drawn from a uniform distribution.

    Draws ``sample_count`` samples, each made of ``sample_size`` values
    taken from ``random.uniform(0.0, 1.0)``, computes the mean of every
    sample, and shows a 20-bin plotly histogram of those means.

    Args:
        sample_size: number of uniform draws averaged into one sample mean.
        sample_count: number of sample means to generate and plot.
    """
    sample_means = []
    for _ in range(sample_count):
        draws = [random.uniform(0.0, 1.0) for _ in range(sample_size)]
        sample_means.append(sum(draws) / sample_size)

    # Every sample mean is passed to the histogram with a y value of 1.
    unit_weights = [1] * sample_count

    figure = px.histogram(x=sample_means, y=unit_weights, nbins=20)
    figure.show()
def generic_critical_z_value(probability):
    """Return the critical z-values bounding the central ``probability`` area.

    Uses the standard normal distribution (mean 0, standard deviation 1).
    The remaining ``1 - probability`` area is split evenly between the two
    tails, and the z-values at the edges of the central region are returned.

    Args:
        probability: central area to cover, between 0.0 and 1.0 inclusive
            (e.g. 0.95 for a 95% level).

    Returns:
        A ``(lower_z, upper_z)`` tuple; for 0.95 this is roughly
        ``(-1.96, 1.96)``.

    Raises:
        ValueError: if ``probability`` is outside ``[0.0, 1.0]``.
    """
    if not 0.0 <= probability <= 1.0:
        raise ValueError("probability must be between 0.0 and 1.0")

    norm_dist = norm(loc=0.0, scale=1.0)
    left_tail_area = (1.0 - probability) / 2.0
    # Everything except the right tail lies below the upper critical value.
    upper_area = 1.0 - left_tail_area
    return norm_dist.ppf(left_tail_area), norm_dist.ppf(upper_area)
def margin_of_error(sample_size, standard_deviation, z_value):
    """Return the margin of error for a sample at the given critical z-value.

    Computed as ``z_value * (standard_deviation / sqrt(sample_size))``.

    Args:
        sample_size: number of observations in the sample.
        standard_deviation: standard deviation used for the estimate.
        z_value: critical z-value (lower-tail or upper-tail).

    Returns:
        The margin of error, carrying the sign of ``z_value``: a lower-tail
        z-value gives the negative margin, an upper-tail one the positive.
    """
    standard_error = standard_deviation / sqrt(sample_size)
    return z_value * standard_error
# How confident we are in a population metric given a sample: the interval
# in which we are "probability" sure the true value lies.
def confidence_interval(probability, sample_size, standard_deviation, z_value, mean):
    """Return the confidence interval around ``mean`` for a sample.

    Args:
        probability: confidence level (e.g. 0.95). Only used to derive the
            critical z-value when ``z_value`` is ``None``.
        sample_size: number of observations in the sample.
        standard_deviation: standard deviation used for the estimate.
        z_value: critical z-value to use. Pass ``None`` to derive the
            upper critical value from ``probability``.
        mean: the sample mean the interval is centred on.

    Returns:
        A ``(mean + margin, mean - margin)`` tuple. With a positive
        (upper-tail) z-value this is ``(upper_bound, lower_bound)``.
    """
    if z_value is None:
        # Fall back to the upper critical value for the requested level.
        _, z_value = generic_critical_z_value(probability)

    # Named differently from the helper so the local does not shadow it.
    error_margin = margin_of_error(sample_size, standard_deviation, z_value)
    return mean + error_margin, mean - error_margin
def test_statistics_module(): def test_statistics_module():
print("=== Statistics module ===") print("=== Statistics module ===")
list = [ 1, 2, 3, 4, 5, 6] list = [ 1, 2, 3, 4, 5, 6]
@@ -100,3 +128,7 @@ def test_statistics_module():
print("The House A is much more expensive because its z-score is higher.") print("The House A is much more expensive because its z-score is higher.")
print("The neighborhood of B has a coeficient of variation: {0}, and the one of A: {1}".format(coeficient_of_variation(3000, 140000), coeficient_of_variation(10000, 800000))) print("The neighborhood of B has a coeficient of variation: {0}, and the one of A: {1}".format(coeficient_of_variation(3000, 140000), coeficient_of_variation(10000, 800000)))
print("This means that the neighborhood of A has more spread in its prices") print("This means that the neighborhood of A has more spread in its prices")
## Central limit theorem
test_central_limit_theorem(sample_size=1, sample_count=1000)
test_central_limit_theorem(sample_size=31, sample_count=1000)