statistics program
week 2. Toolboxes for Data Scientists _ 현주 본문

In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [ ]:
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
In [ ]:
data = { 'year' : [
2010, 2011, 2012,
2010, 2011, 2012,
2010, 2011, 2012
],
'team' : [
'FCBarcelona', 'FCBarcelona',
'FCBarcelona', 'RMadrid',
'RMadrid', 'RMadrid',
'ValenciaCF', 'ValenciaCF',
'ValenciaCF'
],
'wins': [30,28,32,29,32,26,21,17,19],
'draws': [6,7,4,5,4,7,8,10,8],
'losses': [2,3,2,4,2,5,9,11,11]
}
football = pd.DataFrame(data, columns= [
'year', 'team', 'wins', 'draws', 'losses'
]
)
football
Out[ ]:
yearteamwinsdrawslosses012345678
| 2010 | FCBarcelona | 30 | 6 | 2 |
| 2011 | FCBarcelona | 28 | 7 | 3 |
| 2012 | FCBarcelona | 32 | 4 | 2 |
| 2010 | RMadrid | 29 | 5 | 4 |
| 2011 | RMadrid | 32 | 4 | 2 |
| 2012 | RMadrid | 26 | 7 | 5 |
| 2010 | ValenciaCF | 21 | 8 | 9 |
| 2011 | ValenciaCF | 17 | 10 | 11 |
| 2012 | ValenciaCF | 19 | 8 | 11 |
In [ ]:
edu = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/educ_figdp_1_Data.csv',
na_values = ':',
usecols = ["TIME","GEO","Value"])
In [ ]:
edu.head()
Out[ ]:
TIMEGEOValue01234
| 2000 | European Union (28 countries) | NaN |
| 2001 | European Union (28 countries) | NaN |
| 2002 | European Union (28 countries) | 5.00 |
| 2003 | European Union (28 countries) | 5.03 |
| 2004 | European Union (28 countries) | 4.95 |
In [ ]:
edu.tail()
Out[ ]:
TIMEGEOValue379380381382383
| 2007 | Finland | 5.90 |
| 2008 | Finland | 6.10 |
| 2009 | Finland | 6.81 |
| 2010 | Finland | 6.85 |
| 2011 | Finland | 6.76 |
In [ ]:
edu.describe()
Out[ ]:
TIMEValuecountmeanstdmin25%50%75%max
| 384.000000 | 361.000000 |
| 2005.500000 | 5.203989 |
| 3.456556 | 1.021694 |
| 2000.000000 | 2.880000 |
| 2002.750000 | 4.620000 |
| 2005.500000 | 5.060000 |
| 2008.250000 | 5.660000 |
| 2011.000000 | 8.810000 |
In [ ]:
edu['Value']
Out[ ]:
0 NaN
1 NaN
2 5.00
3 5.03
4 4.95
...
379 5.90
380 6.10
381 6.81
382 6.85
383 6.76
Name: Value, Length: 384, dtype: float64
In [ ]:
edu[10:14]
Out[ ]:
TIMEGEOValue10111213
| 2010 | European Union (28 countries) | 5.41 |
| 2011 | European Union (28 countries) | 5.25 |
| 2000 | European Union (27 countries) | 4.91 |
| 2001 | European Union (27 countries) | 4.99 |
In [ ]:
edu.loc[90:94, ['TIME','GEO']]
Out[ ]:
TIMEGEO9091929394
| 2006 | Belgium |
| 2007 | Belgium |
| 2008 | Belgium |
| 2009 | Belgium |
| 2010 | Belgium |
In [ ]:
edu[edu['Value'] > 6.5].tail()
Out[ ]:
TIMEGEOValue286287381382383
| 2010 | Malta | 6.74 |
| 2011 | Malta | 7.96 |
| 2009 | Finland | 6.81 |
| 2010 | Finland | 6.85 |
| 2011 | Finland | 6.76 |
In [ ]:
edu[edu["Value"].isnull()].head()
Out[ ]:
TIMEGEOValue01363748
| 2000 | European Union (28 countries) | NaN |
| 2001 | European Union (28 countries) | NaN |
| 2000 | Euro area (18 countries) | NaN |
| 2001 | Euro area (18 countries) | NaN |
| 2000 | Euro area (17 countries) | NaN |
In [ ]:
edu.max(axis = 0)
Out[ ]:
TIME 2011
GEO Spain
Value 8.81
dtype: object
In [ ]:
print("Pandas max function: ", edu['Value'].max())
print("Python max function: ", max(edu['Value']))
Pandas max function: 8.81
Python max function: nan
In [ ]:
s = edu["Value"]/100
s.head()
Out[ ]:
0 NaN
1 NaN
2 0.0500
3 0.0503
4 0.0495
Name: Value, dtype: float64
In [ ]:
s = edu["Value"].apply(np.sqrt)
s.head()
Out[ ]:
0 NaN
1 NaN
2 2.236068
3 2.242766
4 2.224860
Name: Value, dtype: float64
In [ ]:
s = edu["Value"].apply(lambda d: d**2)
s.head()
Out[ ]:
0 NaN
1 NaN
2 25.0000
3 25.3009
4 24.5025
Name: Value, dtype: float64
In [ ]:
edu["ValueNorm"] = edu["Value"]/edu["Value"].max()
edu.tail()
Out[ ]:
TIMEGEOValueValueNorm379380381382383
| 2007 | Finland | 5.90 | 0.669694 |
| 2008 | Finland | 6.10 | 0.692395 |
| 2009 | Finland | 6.81 | 0.772985 |
| 2010 | Finland | 6.85 | 0.777526 |
| 2011 | Finland | 6.76 | 0.767310 |
In [ ]:
edu.drop('ValueNorm', axis = 1, inplace = True)
edu.head()
Out[ ]:
TIMEGEOValue01234
| 2000 | European Union (28 countries) | NaN |
| 2001 | European Union (28 countries) | NaN |
| 2002 | European Union (28 countries) | 5.00 |
| 2003 | European Union (28 countries) | 5.03 |
| 2004 | European Union (28 countries) | 4.95 |
In [ ]:
edu = edu.append({"TIME": 2000, "Value": 5.00, "GEO": 'a'},
ignore_index = True)
edu.tail()
<ipython-input-28-2372007df89b>:1: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
edu = edu.append({"TIME": 2000, "Value": 5.00, "GEO": 'a'},
Out[ ]:
TIMEGEOValue380381382383384
| 2008 | Finland | 6.10 |
| 2009 | Finland | 6.81 |
| 2010 | Finland | 6.85 |
| 2011 | Finland | 6.76 |
| 2000 | a | 5.00 |
In [ ]:
edu.drop(max(edu.index), axis = 0, inplace = True)
edu.tail()
Out[ ]:
TIMEGEOValue379380381382383
| 2007 | Finland | 5.90 |
| 2008 | Finland | 6.10 |
| 2009 | Finland | 6.81 |
| 2010 | Finland | 6.85 |
| 2011 | Finland | 6.76 |
In [ ]:
eduDrop = edu.drop(edu[edu["Value"].isnull()].index, axis=0)
eduDrop.head()
Out[ ]:
TIMEGEOValue23456
| 2002 | European Union (28 countries) | 5.00 |
| 2003 | European Union (28 countries) | 5.03 |
| 2004 | European Union (28 countries) | 4.95 |
| 2005 | European Union (28 countries) | 4.92 |
| 2006 | European Union (28 countries) | 4.91 |
In [ ]:
eduDrop = edu.dropna(how = 'any', subset = ["Value"])
eduDrop.head()
Out[ ]:
TIMEGEOValue23456
| 2002 | European Union (28 countries) | 5.00 |
| 2003 | European Union (28 countries) | 5.03 |
| 2004 | European Union (28 countries) | 4.95 |
| 2005 | European Union (28 countries) | 4.92 |
| 2006 | European Union (28 countries) | 4.91 |
In [ ]:
eduFilled = edu.fillna(value = {"Value":0})
eduFilled.head()
Out[ ]:
TIMEGEOValue01234
| 2000 | European Union (28 countries) | 0.00 |
| 2001 | European Union (28 countries) | 0.00 |
| 2002 | European Union (28 countries) | 5.00 |
| 2003 | European Union (28 countries) | 5.03 |
| 2004 | European Union (28 countries) | 4.95 |
In [ ]:
edu.sort_values(by = 'Value', ascending = False,
inplace = True)
edu.head()
Out[ ]:
TIMEGEOValue130131129121122
| 2010 | Denmark | 8.81 |
| 2011 | Denmark | 8.75 |
| 2009 | Denmark | 8.74 |
| 2001 | Denmark | 8.44 |
| 2002 | Denmark | 8.44 |
In [ ]:
edu.sort_index(axis = 0, ascending = True, inplace = True)
edu.head()
Out[ ]:
TIMEGEOValue01234
| 2000 | European Union (28 countries) | NaN |
| 2001 | European Union (28 countries) | NaN |
| 2002 | European Union (28 countries) | 5.00 |
| 2003 | European Union (28 countries) | 5.03 |
| 2004 | European Union (28 countries) | 4.95 |
In [ ]:
group = edu[["GEO", "Value"]].groupby('GEO').mean()
group.head()
Out[ ]:
ValueGEOAustriaBelgiumBulgariaCyprusCzech Republic
| 5.618333 |
| 6.189091 |
| 4.093333 |
| 7.023333 |
| 4.168333 |
In [ ]:
filtered_data = edu[edu["TIME"] > 2005]
pivedu = pd.pivot_table(filtered_data, values = 'Value',
index = ['GEO'],
columns = ['TIME'])
pivedu.head()
Out[ ]:
TIME200620072008200920102011GEOAustriaBelgiumBulgariaCyprusCzech Republic
| 5.40 | 5.33 | 5.47 | 5.98 | 5.91 | 5.80 |
| 5.98 | 6.00 | 6.43 | 6.57 | 6.58 | 6.55 |
| 4.04 | 3.88 | 4.44 | 4.58 | 4.10 | 3.82 |
| 7.02 | 6.95 | 7.45 | 7.98 | 7.92 | 7.87 |
| 4.42 | 4.05 | 3.92 | 4.36 | 4.25 | 4.51 |
In [ ]:
pivedu.loc[['Spain','Portugal'], [2006,2011]]
Out[ ]:
TIME20062011GEOSpainPortugal
| 4.26 | 4.82 |
| 5.07 | 5.27 |
In [ ]:
pivedu = pivedu.drop([
'Euro area (13 countries)',
'Euro area (15 countries)',
'Euro area (17 countries)',
'Euro area (18 countries)',
'European Union (25 countries)',
'European Union (27 countries)',
'European Union (28 countries)',],
axis = 0)
pivedu = pivedu.rename(index = {'Germany (untill 1990 former territory of the FRG)': 'Germany'})
pivedu = pivedu.dropna()
pivedu.rank(ascending = False, method = 'first').head()
Out[ ]:
TIME200620072008200920102011GEOAustriaBelgiumBulgariaCyprusCzech Republic
| 10.0 | 7.0 | 11.0 | 7.0 | 8.0 | 8.0 |
| 5.0 | 4.0 | 3.0 | 4.0 | 5.0 | 5.0 |
| 21.0 | 21.0 | 20.0 | 20.0 | 22.0 | 22.0 |
| 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 3.0 |
| 19.0 | 20.0 | 21.0 | 21.0 | 20.0 | 19.0 |
In [ ]:
totalsum = pivedu.sum(axis = 1).sort_values(ascending = False)
totalsum.plot(kind = 'bar', style = 'b', alpha = 0.4,
title = "Total Values for Country")
Out[ ]:
<Axes: title={'center': 'Total Values for Country'}, xlabel='GEO'>

In [ ]:
my_colors = ['b', 'r', 'g', 'y', 'm', 'c']
ax = pivedu.plot(kind = 'barh',
stacked = True,
color = my_colors)
ax.legend(loc = 'center left', bbox_to_anchor = (1, .5))
Out[ ]:
<matplotlib.legend.Legend at 0x7e8e20484b20>

'학습 정리 > 따봉콩쥐야고마워' 카테고리의 다른 글
| week 5. Regression Analysis _ 현주 (0) | 2024.06.29 |
|---|---|
| week 4. Statistical Inference _ 현주 (0) | 2024.06.29 |
| week 3. Descriptive Statistics _ 현주 (0) | 2024.06.29 |
| week 1 . 초기 환경 설정 _ 현주 (0) | 2024.05.23 |