minor statistics definitions

This commit is contained in:
2025-12-19 00:24:16 +01:00
parent 170697c955
commit 893250e7b5

View File

@@ -1,6 +1,13 @@
## This module represents the third chapter of the book
## "Essential Math for Data Science" - Thomas Nield
## Chapter 3 - Statistics
from math import sqrt, pi, e, exp
from scipy.stats import norm
import random
import plotly.express as px
def mean(list):
    """Return the arithmetic mean of the values in ``list``.

    Args:
        list: A non-empty, sized collection of numbers. The name shadows the
            builtin ``list``; it is kept so existing keyword callers keep
            working.

    Returns:
        The sum of the values divided by the number of values.

    Raises:
        ZeroDivisionError: If ``list`` is empty.
    """
    return sum(list) / len(list)
@@ -73,6 +80,27 @@ def z_score(value, data_mean, std_deviation):
def coeficient_of_variation(std_deviation, mean):
    """Return the coefficient of variation, ``std_deviation / mean``.

    The ratio expresses the spread relative to the mean, which makes the
    spread of datasets with different scales comparable.

    Args:
        std_deviation: Standard deviation of the data.
        mean: Mean of the data. Inside this function the name refers to this
            argument, not to the module-level ``mean`` function.

    Returns:
        The standard deviation divided by the mean.

    Raises:
        ZeroDivisionError: If ``mean`` is zero.
    """
    return std_deviation / mean
def test_central_limit_theorem(sample_size, sample_count):
    """Plot a histogram of sample means drawn from a uniform distribution.

    Draws ``sample_count`` samples, each made of ``sample_size`` values taken
    uniformly from [0.0, 1.0], computes the mean of every sample and shows the
    means in a 20-bin histogram.

    Args:
        sample_size: Number of uniform values averaged into each sample mean.
        sample_count: Number of sample means to generate and plot.

    Raises:
        ZeroDivisionError: If ``sample_size`` is zero.
    """
    x_values = [
        sum(random.uniform(0.0, 1.0) for _ in range(sample_size)) / sample_size
        for _ in range(sample_count)
    ]
    # Every sample mean carries a weight of 1 in the histogram.
    y_values = [1] * sample_count
    px.histogram(x=x_values, y=y_values, nbins=20).show()
def generic_critical_z_value(probability):
    """Return the lower and upper critical z-values for ``probability``.

    The critical values bound the symmetric central region of the standard
    normal distribution (mean 0.0, standard deviation 1.0) that contains
    ``probability`` of the total area.

    Args:
        probability: Area of the central region, e.g. 0.95.

    Returns:
        A ``(lower, upper)`` tuple of z-values taken from the inverse CDF of
        the standard normal distribution.
    """
    norm_dist = norm(loc=0.0, scale=1.0)
    # The area left outside the central region is split evenly between tails.
    left_tail_area = (1.0 - probability) / 2.0
    upper_area = 1.0 - left_tail_area
    return norm_dist.ppf(left_tail_area), norm_dist.ppf(upper_area)
def margin_of_error(sample_size, standard_deviation, z_value):
    """Return the margin of error for a sample.

    Computed as ``z_value * standard_deviation / sqrt(sample_size)``. The
    margin is conceptually +/-; the returned value carries the sign of
    ``z_value`` (negative for a lower-tail z, positive for an upper-tail z).

    Args:
        sample_size: Number of observations in the sample.
        standard_deviation: Standard deviation used for the estimate.
        z_value: Critical z-value to scale the standard error by.

    Returns:
        The signed margin of error.
    """
    standard_error = standard_deviation / sqrt(sample_size)
    return z_value * standard_error
# Confidence interval: the range in which we are "probability" sure the population mean lies, given a sample.
def confidence_interval(probability, sample_size, standard_deviation, z_value, mean):
    """Return the confidence interval around ``mean`` for a sample.

    Args:
        probability: Confidence level, e.g. 0.95. Only used when ``z_value``
            is None.
        sample_size: Number of observations in the sample.
        standard_deviation: Standard deviation used for the estimate.
        z_value: Critical z-value used for the margin of error. Pass None to
            derive the upper critical z-value of the standard normal
            distribution from ``probability``.
        mean: Sample mean the interval is centred on.

    Returns:
        A ``(mean + margin, mean - margin)`` tuple. With a positive
        ``z_value`` this is ``(upper bound, lower bound)``.
    """
    if z_value is None:
        # Upper critical z of a two-sided interval covering `probability`.
        z_value = norm.ppf(1.0 - (1.0 - probability) / 2.0)
    # Use a local name distinct from the helper: the original assignment
    # `margin_error = margin_error(...)` could never run.
    error = margin_of_error(sample_size, standard_deviation, z_value)
    return mean + error, mean - error
def test_statistics_module():
print("=== Statistics module ===")
list = [ 1, 2, 3, 4, 5, 6]
@@ -100,3 +128,7 @@ def test_statistics_module():
print("The House A is much more expensive because its z-score is higher.")
print("The neighborhood of B has a coeficient of variation: {0}, and the one of A: {1}".format(coeficient_of_variation(3000, 140000), coeficient_of_variation(10000, 800000)))
print("This means that the neighborhood of A has more spread in its prices")
## Central limit theorem
test_central_limit_theorem(sample_size=1, sample_count=1000)
test_central_limit_theorem(sample_size=31, sample_count=1000)