minor statistics definitions
This commit is contained in:
@@ -1,6 +1,13 @@
|
|||||||
|
## This module represents the third chapter of the book
|
||||||
|
## "Essential Math for Data Science" - Thomas Nield
|
||||||
|
## Chapter 3 - Statistics
|
||||||
|
|
||||||
from math import sqrt, pi, e, exp
|
from math import sqrt, pi, e, exp
|
||||||
from scipy.stats import norm
|
from scipy.stats import norm
|
||||||
|
|
||||||
|
import random
|
||||||
|
import plotly.express as px
|
||||||
|
|
||||||
def mean(list):
    """Return the arithmetic mean of the values in ``list``.

    The sum of the values is divided by how many there are, so an empty
    sequence raises ``ZeroDivisionError``.
    """
    total = sum(list)
    count = len(list)
    return total / count
|
||||||
|
|
||||||
@@ -73,6 +80,27 @@ def z_score(value, data_mean, std_deviation):
|
|||||||
def coeficient_of_variation(std_deviation, mean):
    """Return the coefficient of variation: ``std_deviation`` divided by ``mean``.

    The result is a unitless ratio, which lets the spread of data sets with
    different scales be compared directly.
    """
    ratio = std_deviation / mean
    return ratio
|
||||||
|
|
||||||
|
def test_central_limit_theorem(sample_size, sample_count):
    """Plot a histogram of sample means as a visual central limit theorem check.

    Draws ``sample_count`` samples, each made of ``sample_size`` values from
    ``random.uniform(0.0, 1.0)``, averages every sample, and shows the
    averages in a 20-bin plotly histogram.
    """
    sample_means = []
    for _ in range(sample_count):
        draws = [random.uniform(0.0, 1.0) for _ in range(sample_size)]
        sample_means.append(sum(draws) / sample_size)

    # One unit of weight per sample mean is passed as the histogram's y data.
    unit_weights = [1] * sample_count

    figure = px.histogram(x=sample_means, y=unit_weights, nbins=20)
    figure.show()
|
||||||
|
|
||||||
|
def generic_critical_z_value(probability):
    """Return the (lower, upper) critical z-values for a central probability.

    The standard normal distribution (mean 0, standard deviation 1) is split
    into a central region holding ``probability`` of the area and two equal
    tails sharing the remainder. The z-values at the two boundaries are
    returned.

    Args:
        probability: Area of the central region, between 0.0 and 1.0.

    Returns:
        A ``(lower_z, upper_z)`` tuple; the values are symmetric around zero.

    Raises:
        ValueError: If ``probability`` is not within [0.0, 1.0].
    """
    # ppf() returns NaN for areas outside [0, 1]; fail loudly instead.
    if not 0.0 <= probability <= 1.0:
        raise ValueError(
            "probability must be between 0.0 and 1.0, got {0!r}".format(probability)
        )

    norm_dist = norm(loc=0.0, scale=1.0)
    # Area left outside the central region, split evenly between both tails.
    tail_area = (1.0 - probability) / 2.0
    return norm_dist.ppf(tail_area), norm_dist.ppf(1.0 - tail_area)
|
||||||
|
|
||||||
|
def margin_of_error(sample_size, standard_deviation, z_value):
    """Return the margin of error of a sample mean for a given z-value.

    The standard error (``standard_deviation`` over the square root of
    ``sample_size``) is scaled by ``z_value``.
    """
    standard_error = standard_deviation / sqrt(sample_size)
    # The result keeps the sign of z_value: a lower-tail (negative) z gives a
    # negative margin, an upper-tail (positive) z gives a positive one.
    return z_value * standard_error
|
||||||
|
|
||||||
|
def confidence_interval(probability, sample_size, standard_deviation, z_value, mean):
    """Return the interval expected to contain the population mean.

    Given a sample, this is the range in which we are ``probability`` sure
    the population metric lies.

    Args:
        probability: Confidence level between 0.0 and 1.0. Only used to
            derive the critical z-value when ``z_value`` is None.
        sample_size: Number of observations in the sample.
        standard_deviation: Standard deviation used for the standard error.
        z_value: Critical z-value to scale the standard error with, or None
            to derive the upper critical value from ``probability``.
        mean: Sample mean the interval is centred on.

    Returns:
        A ``(mean + margin, mean - margin)`` tuple. With a positive
        ``z_value`` the first element is the upper bound.
    """
    if z_value is None:
        # Upper critical value of the standard normal for a two-tailed
        # interval that leaves (1 - probability) / 2 in each tail.
        z_value = norm.ppf(1.0 - (1.0 - probability) / 2.0)

    # Use a distinct local name: binding a local called like the helper
    # would shadow it and make the call fail with UnboundLocalError.
    margin = margin_of_error(sample_size, standard_deviation, z_value)
    return mean + margin, mean - margin
|
||||||
|
|
||||||
|
|
||||||
def test_statistics_module():
|
def test_statistics_module():
|
||||||
print("=== Statistics module ===")
|
print("=== Statistics module ===")
|
||||||
list = [ 1, 2, 3, 4, 5, 6]
|
list = [ 1, 2, 3, 4, 5, 6]
|
||||||
@@ -100,3 +128,7 @@ def test_statistics_module():
|
|||||||
print("The House A is much more expensive because its z-score is higher.")
|
print("The House A is much more expensive because its z-score is higher.")
|
||||||
print("The neighborhood of B has a coeficient of variation: {0}, and the one of A: {1}".format(coeficient_of_variation(3000, 140000), coeficient_of_variation(10000, 800000)))
|
print("The neighborhood of B has a coeficient of variation: {0}, and the one of A: {1}".format(coeficient_of_variation(3000, 140000), coeficient_of_variation(10000, 800000)))
|
||||||
print("This means that the neighborhood of A has more spread in its prices")
|
print("This means that the neighborhood of A has more spread in its prices")
|
||||||
|
|
||||||
|
## Central limit theorem
|
||||||
|
test_central_limit_theorem(sample_size=1, sample_count=1000)
|
||||||
|
test_central_limit_theorem(sample_size=31, sample_count=1000)
|
||||||
|
|||||||
Reference in New Issue
Block a user