Notice
Recent Posts
Recent Comments
Link
«   2026/06   »
1 2 3 4 5 6
7 8 9 10 11 12 13
14 15 16 17 18 19 20
21 22 23 24 25 26 27
28 29 30
Tags
more
Archives
Today
Total
관리 메뉴

statistics program

week 3. Descriptive Statistics _ 현주 본문

학습 정리/따봉콩쥐야고마워

week 3. Descriptive Statistics _ 현주

따봉콩쥐야고마워 2024. 6. 29. 19:13

 

ch03.ipynb
0.29MB

 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [ ]:
file = open('/content/drive/MyDrive/Colab Notebooks/adult.data', 'r')
def chr_int(a):
  if a.isdigit():
    return int(a)
  else:
    return 0

data = []
for line in file:
  data1 = line.split(',')
  if len(data1) == 15:
    data.append([chr_int(data1[0]), data1[1],
                chr_int(data1[2]), data1[3],
                chr_int(data1[4]), data1[5],
                data1[6],data1[7],data1[8],
                data1[9], chr_int(data1[10]),
                chr_int(data1[11]),
                chr_int(data1[12]),
                data1[13], data1[14]
                ])
In [ ]:
print(data[1:2])
[[50, ' Self-emp-not-inc', 0, ' Bachelors', 0, ' Married-civ-spouse', ' Exec-managerial', ' Husband', ' White', ' Male', 0, 0, 0, ' United-States', ' <=50K\n']]
In [ ]:
df = pd.DataFrame(data)
df.columns = [
    'age', 'type_employer', 'fnlwgt',
    'education', 'education_num', 'material',
    'occupation', 'relationship', 'race',
    'sex', 'capital_gain', 'capital_loss',
    'hr_per_week', 'country', 'income'
    ]
df
Out[ ]:
agetype_employerfnlwgteducationeducation_nummaterialoccupationrelationshipracesexcapital_gaincapital_losshr_per_weekcountryincome01234...3255632557325583255932560
39 State-gov 0 Bachelors 0 Never-married Adm-clerical Not-in-family White Male 0 0 0 United-States <=50K\n
50 Self-emp-not-inc 0 Bachelors 0 Married-civ-spouse Exec-managerial Husband White Male 0 0 0 United-States <=50K\n
38 Private 0 HS-grad 0 Divorced Handlers-cleaners Not-in-family White Male 0 0 0 United-States <=50K\n
53 Private 0 11th 0 Married-civ-spouse Handlers-cleaners Husband Black Male 0 0 0 United-States <=50K\n
28 Private 0 Bachelors 0 Married-civ-spouse Prof-specialty Wife Black Female 0 0 0 Cuba <=50K\n
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27 Private 0 Assoc-acdm 0 Married-civ-spouse Tech-support Wife White Female 0 0 0 United-States <=50K\n
40 Private 0 HS-grad 0 Married-civ-spouse Machine-op-inspct Husband White Male 0 0 0 United-States >50K\n
58 Private 0 HS-grad 0 Widowed Adm-clerical Unmarried White Female 0 0 0 United-States <=50K\n
22 Private 0 HS-grad 0 Never-married Adm-clerical Own-child White Male 0 0 0 United-States <=50K\n
52 Self-emp-inc 0 HS-grad 0 Married-civ-spouse Exec-managerial Wife White Female 0 0 0 United-States >50K\n

32561 rows × 15 columns

 
In [ ]:
df.shape
Out[ ]:
(32561, 15)
In [ ]:
counts = df.groupby('country').size()
print(counts.head())
country
 ?           583
 Cambodia     19
 Canada      121
 China        75
 Columbia     59
dtype: int64
In [ ]:
ml = df[df['sex'] == ' Male']
ml
Out[ ]:

 

39 State-gov 0 Bachelors 0 Never-married Adm-clerical Not-in-family White Male 0 0 0 United-States <=50K\n
50 Self-emp-not-inc 0 Bachelors 0 Married-civ-spouse Exec-managerial Husband White Male 0 0 0 United-States <=50K\n
38 Private 0 HS-grad 0 Divorced Handlers-cleaners Not-in-family White Male 0 0 0 United-States <=50K\n
53 Private 0 11th 0 Married-civ-spouse Handlers-cleaners Husband Black Male 0 0 0 United-States <=50K\n
52 Self-emp-not-inc 0 HS-grad 0 Married-civ-spouse Exec-managerial Husband White Male 0 0 0 United-States >50K\n
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
32 Private 0 Masters 0 Never-married Tech-support Not-in-family Asian-Pac-Islander Male 0 0 0 Taiwan <=50K\n
53 Private 0 Masters 0 Married-civ-spouse Exec-managerial Husband White Male 0 0 0 United-States >50K\n
22 Private 0 Some-college 0 Never-married Protective-serv Not-in-family White Male 0 0 0 United-States <=50K\n
40 Private 0 HS-grad 0 Married-civ-spouse Machine-op-inspct Husband White Male 0 0 0 United-States >50K\n
22 Private 0 HS-grad 0 Never-married Adm-clerical Own-child White Male 0 0 0 United-States <=50K\n

21790 rows × 15 columns

 
In [ ]:
ml1 = df[(df.sex == ' Male') & (df.income==' >50K\n')]
fm = df[(df.sex == ' Female')]
fm1 = df[(df.sex == ' Female') & (df.income==' >50K\n')]
In [ ]:
df1 = df[(df.income==' >50K\n')]
print('The rate of people with high income is: ',
      int(len(df1)/float(len(df))*100), '%.')
print('The rate of men with high income is: ',
      int(len(ml1)/float(len(ml))*100), '%.')
print('The rate of people with high income is: ',
      int(len(fm1)/float(len(fm))*100), '%.')
The rate of people with high income is:  24 %.
The rate of men with high income is:  30 %.
The rate of people with high income is:  10 %.
In [ ]:
print('The average age of men is: ',
      ml['age'].mean())
print('The average age of women is: ',
      fm['age'].mean())

print('The average age of high-income men is: ',
      ml1['age'].mean())
print('The average age of high-income women is: ',
      ml['age'].mean())
The average age of men is:  39.43354749885268
The average age of women is:  36.85823043357163
The average age of high-income men is:  44.62578805163614
The average age of high-income women is:  39.43354749885268
In [ ]:
ml_mu = ml['age'].mean()
fm_mu = fm['age'].mean()
ml_var = ml['age'].var()
fm_var = fm['age'].var()
ml_std = ml['age'].std()
fm_std = fm['age'].std()
print('Statistics of age for men: mu: ',
      ml_mu, 'var: ', ml_var, 'std: ', ml_std)
print('Statistics of age for women: mu: ',
      fm_mu, 'var: ', fm_var, 'std: ', fm_std)
Statistics of age for men: mu:  39.43354749885268 var:  178.77375174529985 std:  13.370630192526448
Statistics of age for women: mu:  36.85823043357163 var:  196.3837063948063 std:  14.013697099438332
In [ ]:
ml_median = ml['age'].median()
fm_median = fm['age'].median()
print("Median age per men and women: ", ml_median, fm_median)

ml_median_age = ml1['age'].median()
fm_median_age = fm1['age'].median()
print("Median age per men and women with high-income: ", ml_median_age, fm_median_age)
Median age per men and women:  38.0 35.0
Median age per men and women with high-income:  44.0 41.0
In [ ]:
ml_age = ml['age']
ml_age.hist(density = 0, histtype = 'stepfilled', bins = 20)
Out[ ]:
<Axes: >
In [ ]:
fm_age = fm['age']
fm_age.hist(density = 0, histtype = 'stepfilled', bins = 10)
Out[ ]:
<Axes: >
In [ ]:
import seaborn as sns
fm_age.hist(density = 0, histtype = 'stepfilled', alpha = .5, bins = 20)
ml_age.hist(density = 0, histtype = 'stepfilled', alpha = .5,
            color = sns.desaturate("indianred",.75), bins = 10)
Out[ ]:
<Axes: >
 
In [ ]:
fm_age.hist(density = 1, histtype = 'stepfilled', alpha = .5, bins = 20)
ml_age.hist(density = 1, histtype = 'stepfilled', alpha = .5, bins = 10,
            color = sns.desaturate("indianred",.75))
Out[ ]:
<Axes: >
 
In [ ]:
ml_age.hist(density = 1, histtype = 'step',
            cumulative = True, linewidth = 3.5,
            bins = 20)
fm_age.hist(density = 1, histtype = 'step',
            cumulative = True, linewidth = 3.5,
            bins = 20, color = sns.desaturate("indianred", .75))
Out[ ]:
<Axes: >
 
In [ ]:
df2 = df.drop(df.index[
    (df.income == '>50K/n') &
     (df['age'] > df['age'].median() + 35) &
      (df['age'] > df['age'].median() -15)])
ml1_age = ml1['age']
fm1_age = fm1['age']

ml2_age = ml1_age.drop(ml1_age.index[
    (ml1_age > df['age'].median() + 35) &
    (ml1_age > df['age'].median() - 15)])
fm2_age = fm1_age.drop(fm1_age.index[
    (fm1_age > df['age'].median() + 35) &
    (fm1_age > df['age'].median() - 15)])
In [ ]:
mu2ml = ml2_age.mean()
std2ml = ml2_age.std()
md2ml = ml2_age.median()
mu2fm = fm2_age.mean()
std2fm = fm2_age.std()
md2fm = fm2_age.median()

print("Men statistics: Mean:", mu2ml, "Std: ", std2ml, "Median: ", md2ml, "Min: ", ml2_age.min(), "Max: ", ml2_age.max())
print("Women statistics: Mean:", mu2fm, "Std: ", std2fm, "Median: ", md2fm, "Min: ", fm2_age.min(), "Max: ", fm2_age.max())
Men statistics: Mean: 44.317982123920615 Std:  10.019749857171409 Median:  44.0 Min:  19 Max:  72
Women statistics: Mean: 41.877028181041844 Std:  10.036441807343707 Median:  41.0 Min:  19 Max:  72
In [ ]:
plt.figure(figsize = (13.4, 5))
df.age[(df.income==' >50K\n')].plot(alpha = .25, color = 'blue')
df2.age[(df2.income==' >50K\n')].plot(alpha = .45, color = 'red')
Out[ ]:
<Axes: >
 
In [ ]:
print('The mean difference with outliers is: %4.2f.' % (ml_age.mean() - fm_age.mean()))
print('The mean difference without outliers is: %4.2f.' % (ml2_age.mean() - fm2_age.mean()))
The mean difference with outliers is: 2.58.
The mean difference without outliers is: 2.44.
In [ ]:
countx, divisionx = np.histogram(ml2_age, density = True)
county, divisiony = np.histogram(fm2_age, density = True)

val = [(divisionx[i] + divisionx[i+1])/2
       for i in range(len(divisionx) - 1)]
plt.plot(val, countx - county, 'o-')
Out[ ]:
[<matplotlib.lines.Line2D at 0x7e0889c07040>]
 
In [ ]:
def skewness(x):
  res = 0
  m = x.mean()
  s = x.std()
  for i in x:
    res += (i-m) * (i-m) * (i-m)
  res /= (len(x) * s * s * s)
  return res

print("Skewness of the male population = ", skewness (ml2_age))
print("Skewness of the female population = ", skewness (fm2_age))
Skewness of the male population =  0.26644438384328223
Skewness of the female population =  0.3863335249128606
In [ ]:
def pearson(x):
  return 3*(x.mean() - x.median())*x.std()

print("Pearson's coefficient of the male population = ",
      pearson(ml2_age))
print("Pearson's coefficient of the female population = ",
      pearson(fm2_age))
Pearson's coefficient of the male population =  9.558304022209926
Pearson's coefficient of the female population =  26.406726907280902
In [ ]:
from scipy.stats import norm

x1 = np.random.normal(-1, 0.5, 15)
x2 = np.random.normal(6, 1, 10)
y = np.r_[x1, x2]
x = np.linspace(min(y), max(y), 100)

s = 0.4

kernels = np.transpose([norm.pdf(x, yi, s) for yi in y])
plt.plot(x, kernels, 'k:')
plt.plot(x, kernels.sum(1), 'r')
plt.plot(y, np.zeros(len(y)), 'bo', ms = 10)
Out[ ]:
[<matplotlib.lines.Line2D at 0x7e08897a5d20>]
In [ ]:
from scipy.stats import kde
density = kde. gaussian_kde(y)
xgrid = np.linspace(x.min(), x.max(), 200)
plt.hist(y, bins = 28, density = True)
plt.plot(xgrid, density(xgrid), 'r-')
<ipython-input-62-8b9ee6cbe47d>:2: DeprecationWarning: Please use `gaussian_kde` from the `scipy.stats` namespace, the `scipy.stats.kde` namespace is deprecated.
  density = kde. gaussian_kde(y)
Out[ ]:
[<matplotlib.lines.Line2D at 0x7e08895a7dc0>]
 
In [ ]:
NTs = 200
mu = 0.0
var = 1.0
err = 0.0
NPs = 1000
for i in range(NTs) :
  x = np.random.normal(mu, var, NPs)
  err += (x.mean() - mu) ** 2
print('MSE: ', err / NTs)
MSE:  0.0009382668525155507