statistics program
week 3. Descriptive Statistics _ 현주 본문

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [ ]:
file = open('/content/drive/MyDrive/Colab Notebooks/adult.data', 'r')
def chr_int(a):
if a.isdigit():
return int(a)
else:
return 0
data = []
for line in file:
data1 = line.split(',')
if len(data1) == 15:
data.append([chr_int(data1[0]), data1[1],
chr_int(data1[2]), data1[3],
chr_int(data1[4]), data1[5],
data1[6],data1[7],data1[8],
data1[9], chr_int(data1[10]),
chr_int(data1[11]),
chr_int(data1[12]),
data1[13], data1[14]
])
In [ ]:
print(data[1:2])
[[50, ' Self-emp-not-inc', 0, ' Bachelors', 0, ' Married-civ-spouse', ' Exec-managerial', ' Husband', ' White', ' Male', 0, 0, 0, ' United-States', ' <=50K\n']]
In [ ]:
df = pd.DataFrame(data)
df.columns = [
'age', 'type_employer', 'fnlwgt',
'education', 'education_num', 'material',
'occupation', 'relationship', 'race',
'sex', 'capital_gain', 'capital_loss',
'hr_per_week', 'country', 'income'
]
df
Out[ ]:
agetype_employerfnlwgteducationeducation_nummaterialoccupationrelationshipracesexcapital_gaincapital_losshr_per_weekcountryincome01234...3255632557325583255932560
| 39 | State-gov | 0 | Bachelors | 0 | Never-married | Adm-clerical | Not-in-family | White | Male | 0 | 0 | 0 | United-States | <=50K\n |
| 50 | Self-emp-not-inc | 0 | Bachelors | 0 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 0 | United-States | <=50K\n |
| 38 | Private | 0 | HS-grad | 0 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 0 | United-States | <=50K\n |
| 53 | Private | 0 | 11th | 0 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0 | 0 | 0 | United-States | <=50K\n |
| 28 | Private | 0 | Bachelors | 0 | Married-civ-spouse | Prof-specialty | Wife | Black | Female | 0 | 0 | 0 | Cuba | <=50K\n |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27 | Private | 0 | Assoc-acdm | 0 | Married-civ-spouse | Tech-support | Wife | White | Female | 0 | 0 | 0 | United-States | <=50K\n |
| 40 | Private | 0 | HS-grad | 0 | Married-civ-spouse | Machine-op-inspct | Husband | White | Male | 0 | 0 | 0 | United-States | >50K\n |
| 58 | Private | 0 | HS-grad | 0 | Widowed | Adm-clerical | Unmarried | White | Female | 0 | 0 | 0 | United-States | <=50K\n |
| 22 | Private | 0 | HS-grad | 0 | Never-married | Adm-clerical | Own-child | White | Male | 0 | 0 | 0 | United-States | <=50K\n |
| 52 | Self-emp-inc | 0 | HS-grad | 0 | Married-civ-spouse | Exec-managerial | Wife | White | Female | 0 | 0 | 0 | United-States | >50K\n |
32561 rows × 15 columns
In [ ]:
df.shape
Out[ ]:
(32561, 15)
In [ ]:
counts = df.groupby('country').size()
print(counts.head())
country
? 583
Cambodia 19
Canada 121
China 75
Columbia 59
dtype: int64
In [ ]:
ml = df[df['sex'] == ' Male']
ml
Out[ ]:
| 39 | State-gov | 0 | Bachelors | 0 | Never-married | Adm-clerical | Not-in-family | White | Male | 0 | 0 | 0 | United-States | <=50K\n |
| 50 | Self-emp-not-inc | 0 | Bachelors | 0 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 0 | United-States | <=50K\n |
| 38 | Private | 0 | HS-grad | 0 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 0 | United-States | <=50K\n |
| 53 | Private | 0 | 11th | 0 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0 | 0 | 0 | United-States | <=50K\n |
| 52 | Self-emp-not-inc | 0 | HS-grad | 0 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 0 | United-States | >50K\n |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 32 | Private | 0 | Masters | 0 | Never-married | Tech-support | Not-in-family | Asian-Pac-Islander | Male | 0 | 0 | 0 | Taiwan | <=50K\n |
| 53 | Private | 0 | Masters | 0 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 0 | United-States | >50K\n |
| 22 | Private | 0 | Some-college | 0 | Never-married | Protective-serv | Not-in-family | White | Male | 0 | 0 | 0 | United-States | <=50K\n |
| 40 | Private | 0 | HS-grad | 0 | Married-civ-spouse | Machine-op-inspct | Husband | White | Male | 0 | 0 | 0 | United-States | >50K\n |
| 22 | Private | 0 | HS-grad | 0 | Never-married | Adm-clerical | Own-child | White | Male | 0 | 0 | 0 | United-States | <=50K\n |
21790 rows × 15 columns
In [ ]:
ml1 = df[(df.sex == ' Male') & (df.income==' >50K\n')]
fm = df[(df.sex == ' Female')]
fm1 = df[(df.sex == ' Female') & (df.income==' >50K\n')]
In [ ]:
df1 = df[(df.income==' >50K\n')]
print('The rate of people with high income is: ',
int(len(df1)/float(len(df))*100), '%.')
print('The rate of men with high income is: ',
int(len(ml1)/float(len(ml))*100), '%.')
print('The rate of people with high income is: ',
int(len(fm1)/float(len(fm))*100), '%.')
The rate of people with high income is: 24 %.
The rate of men with high income is: 30 %.
The rate of people with high income is: 10 %.
In [ ]:
print('The average age of men is: ',
ml['age'].mean())
print('The average age of women is: ',
fm['age'].mean())
print('The average age of high-income men is: ',
ml1['age'].mean())
print('The average age of high-income women is: ',
ml['age'].mean())
The average age of men is: 39.43354749885268
The average age of women is: 36.85823043357163
The average age of high-income men is: 44.62578805163614
The average age of high-income women is: 39.43354749885268
In [ ]:
ml_mu = ml['age'].mean()
fm_mu = fm['age'].mean()
ml_var = ml['age'].var()
fm_var = fm['age'].var()
ml_std = ml['age'].std()
fm_std = fm['age'].std()
print('Statistics of age for men: mu: ',
ml_mu, 'var: ', ml_var, 'std: ', ml_std)
print('Statistics of age for women: mu: ',
fm_mu, 'var: ', fm_var, 'std: ', fm_std)
Statistics of age for men: mu: 39.43354749885268 var: 178.77375174529985 std: 13.370630192526448
Statistics of age for women: mu: 36.85823043357163 var: 196.3837063948063 std: 14.013697099438332
In [ ]:
ml_median = ml['age'].median()
fm_median = fm['age'].median()
print("Median age per men and women: ", ml_median, fm_median)
ml_median_age = ml1['age'].median()
fm_median_age = fm1['age'].median()
print("Median age per men and women with high-income: ", ml_median_age, fm_median_age)
Median age per men and women: 38.0 35.0
Median age per men and women with high-income: 44.0 41.0
In [ ]:
ml_age = ml['age']
ml_age.hist(density = 0, histtype = 'stepfilled', bins = 20)
Out[ ]:
<Axes: >

In [ ]:
fm_age = fm['age']
fm_age.hist(density = 0, histtype = 'stepfilled', bins = 10)
Out[ ]:
<Axes: >

In [ ]:
import seaborn as sns
fm_age.hist(density = 0, histtype = 'stepfilled', alpha = .5, bins = 20)
ml_age.hist(density = 0, histtype = 'stepfilled', alpha = .5,
color = sns.desaturate("indianred",.75), bins = 10)
Out[ ]:
<Axes: >
In [ ]:
fm_age.hist(density = 1, histtype = 'stepfilled', alpha = .5, bins = 20)
ml_age.hist(density = 1, histtype = 'stepfilled', alpha = .5, bins = 10,
color = sns.desaturate("indianred",.75))
Out[ ]:
<Axes: >
In [ ]:
ml_age.hist(density = 1, histtype = 'step',
cumulative = True, linewidth = 3.5,
bins = 20)
fm_age.hist(density = 1, histtype = 'step',
cumulative = True, linewidth = 3.5,
bins = 20, color = sns.desaturate("indianred", .75))
Out[ ]:
<Axes: >
In [ ]:
df2 = df.drop(df.index[
(df.income == '>50K/n') &
(df['age'] > df['age'].median() + 35) &
(df['age'] > df['age'].median() -15)])
ml1_age = ml1['age']
fm1_age = fm1['age']
ml2_age = ml1_age.drop(ml1_age.index[
(ml1_age > df['age'].median() + 35) &
(ml1_age > df['age'].median() - 15)])
fm2_age = fm1_age.drop(fm1_age.index[
(fm1_age > df['age'].median() + 35) &
(fm1_age > df['age'].median() - 15)])
In [ ]:
mu2ml = ml2_age.mean()
std2ml = ml2_age.std()
md2ml = ml2_age.median()
mu2fm = fm2_age.mean()
std2fm = fm2_age.std()
md2fm = fm2_age.median()
print("Men statistics: Mean:", mu2ml, "Std: ", std2ml, "Median: ", md2ml, "Min: ", ml2_age.min(), "Max: ", ml2_age.max())
print("Women statistics: Mean:", mu2fm, "Std: ", std2fm, "Median: ", md2fm, "Min: ", fm2_age.min(), "Max: ", fm2_age.max())
Men statistics: Mean: 44.317982123920615 Std: 10.019749857171409 Median: 44.0 Min: 19 Max: 72
Women statistics: Mean: 41.877028181041844 Std: 10.036441807343707 Median: 41.0 Min: 19 Max: 72
In [ ]:
plt.figure(figsize = (13.4, 5))
df.age[(df.income==' >50K\n')].plot(alpha = .25, color = 'blue')
df2.age[(df2.income==' >50K\n')].plot(alpha = .45, color = 'red')
Out[ ]:
<Axes: >
In [ ]:
print('The mean difference with outliers is: %4.2f.' % (ml_age.mean() - fm_age.mean()))
print('The mean difference without outliers is: %4.2f.' % (ml2_age.mean() - fm2_age.mean()))
The mean difference with outliers is: 2.58.
The mean difference without outliers is: 2.44.
In [ ]:
countx, divisionx = np.histogram(ml2_age, density = True)
county, divisiony = np.histogram(fm2_age, density = True)
val = [(divisionx[i] + divisionx[i+1])/2
for i in range(len(divisionx) - 1)]
plt.plot(val, countx - county, 'o-')
Out[ ]:
[<matplotlib.lines.Line2D at 0x7e0889c07040>]
In [ ]:
def skewness(x):
res = 0
m = x.mean()
s = x.std()
for i in x:
res += (i-m) * (i-m) * (i-m)
res /= (len(x) * s * s * s)
return res
print("Skewness of the male population = ", skewness (ml2_age))
print("Skewness of the female population = ", skewness (fm2_age))
Skewness of the male population = 0.26644438384328223
Skewness of the female population = 0.3863335249128606
In [ ]:
def pearson(x):
return 3*(x.mean() - x.median())*x.std()
print("Pearson's coefficient of the male population = ",
pearson(ml2_age))
print("Pearson's coefficient of the female population = ",
pearson(fm2_age))
Pearson's coefficient of the male population = 9.558304022209926
Pearson's coefficient of the female population = 26.406726907280902
In [ ]:
from scipy.stats import norm
x1 = np.random.normal(-1, 0.5, 15)
x2 = np.random.normal(6, 1, 10)
y = np.r_[x1, x2]
x = np.linspace(min(y), max(y), 100)
s = 0.4
kernels = np.transpose([norm.pdf(x, yi, s) for yi in y])
plt.plot(x, kernels, 'k:')
plt.plot(x, kernels.sum(1), 'r')
plt.plot(y, np.zeros(len(y)), 'bo', ms = 10)
Out[ ]:
[<matplotlib.lines.Line2D at 0x7e08897a5d20>]

In [ ]:
from scipy.stats import kde
density = kde. gaussian_kde(y)
xgrid = np.linspace(x.min(), x.max(), 200)
plt.hist(y, bins = 28, density = True)
plt.plot(xgrid, density(xgrid), 'r-')
<ipython-input-62-8b9ee6cbe47d>:2: DeprecationWarning: Please use `gaussian_kde` from the `scipy.stats` namespace, the `scipy.stats.kde` namespace is deprecated.
density = kde. gaussian_kde(y)
Out[ ]:
[<matplotlib.lines.Line2D at 0x7e08895a7dc0>]
In [ ]:
NTs = 200
mu = 0.0
var = 1.0
err = 0.0
NPs = 1000
for i in range(NTs) :
x = np.random.normal(mu, var, NPs)
err += (x.mean() - mu) ** 2
print('MSE: ', err / NTs)
MSE: 0.0009382668525155507
'학습 정리 > 따봉콩쥐야고마워' 카테고리의 다른 글
| week 5. Regression Analysis _ 현주 (0) | 2024.06.29 |
|---|---|
| week 4. Statistical Inference _ 현주 (0) | 2024.06.29 |
| week 2. Toolboxes for Data Scientists _ 현주 (0) | 2024.06.29 |
| week 1 . 초기 환경 설정 _ 현주 (0) | 2024.05.23 |