Notice
Recent Posts
Recent Comments
Link
«   2026/06   »
1 2 3 4 5 6
7 8 9 10 11 12 13
14 15 16 17 18 19 20
21 22 23 24 25 26 27
28 29 30
Tags
more
Archives
Today
Total
관리 메뉴

statistics program

week 2. Toolboxes for Data Scientists _ 현주 본문

학습 정리/따봉콩쥐야고마워

week 2. Toolboxes for Data Scientists _ 현주

따봉콩쥐야고마워 2024. 6. 29. 18:48

 

ch02.html
0.55MB

 

In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
 
In [ ]:
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
In [ ]:
data = { 'year' : [
        2010, 2011, 2012,
        2010, 2011, 2012,
        2010, 2011, 2012
],
         'team' : [
            'FCBarcelona', 'FCBarcelona',
            'FCBarcelona', 'RMadrid',
            'RMadrid', 'RMadrid',
            'ValenciaCF', 'ValenciaCF',
            'ValenciaCF'
         ],
         'wins': [30,28,32,29,32,26,21,17,19],
         'draws': [6,7,4,5,4,7,8,10,8],
         'losses': [2,3,2,4,2,5,9,11,11]
         }
football = pd.DataFrame(data, columns= [
    'year', 'team', 'wins', 'draws', 'losses'
]
)
football
Out[ ]:
yearteamwinsdrawslosses012345678
2010 FCBarcelona 30 6 2
2011 FCBarcelona 28 7 3
2012 FCBarcelona 32 4 2
2010 RMadrid 29 5 4
2011 RMadrid 32 4 2
2012 RMadrid 26 7 5
2010 ValenciaCF 21 8 9
2011 ValenciaCF 17 10 11
2012 ValenciaCF 19 8 11
 
In [ ]:
edu = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/educ_figdp_1_Data.csv',
                  na_values = ':',
                  usecols = ["TIME","GEO","Value"])
In [ ]:
edu.head()
Out[ ]:
TIMEGEOValue01234
2000 European Union (28 countries) NaN
2001 European Union (28 countries) NaN
2002 European Union (28 countries) 5.00
2003 European Union (28 countries) 5.03
2004 European Union (28 countries) 4.95
 
In [ ]:
edu.tail()
Out[ ]:
TIMEGEOValue379380381382383
2007 Finland 5.90
2008 Finland 6.10
2009 Finland 6.81
2010 Finland 6.85
2011 Finland 6.76
 
In [ ]:
edu.describe()
Out[ ]:
TIMEValuecountmeanstdmin25%50%75%max
384.000000 361.000000
2005.500000 5.203989
3.456556 1.021694
2000.000000 2.880000
2002.750000 4.620000
2005.500000 5.060000
2008.250000 5.660000
2011.000000 8.810000
 
In [ ]:
edu['Value']
Out[ ]:
0       NaN
1       NaN
2      5.00
3      5.03
4      4.95
       ... 
379    5.90
380    6.10
381    6.81
382    6.85
383    6.76
Name: Value, Length: 384, dtype: float64
In [ ]:
edu[10:14]
Out[ ]:
TIMEGEOValue10111213
2010 European Union (28 countries) 5.41
2011 European Union (28 countries) 5.25
2000 European Union (27 countries) 4.91
2001 European Union (27 countries) 4.99
 
In [ ]:
edu.loc[90:94, ['TIME','GEO']]
Out[ ]:
TIMEGEO9091929394
2006 Belgium
2007 Belgium
2008 Belgium
2009 Belgium
2010 Belgium
 
In [ ]:
edu[edu['Value'] > 6.5].tail()
Out[ ]:
TIMEGEOValue286287381382383
2010 Malta 6.74
2011 Malta 7.96
2009 Finland 6.81
2010 Finland 6.85
2011 Finland 6.76
 
In [ ]:
edu[edu["Value"].isnull()].head()
Out[ ]:
TIMEGEOValue01363748
2000 European Union (28 countries) NaN
2001 European Union (28 countries) NaN
2000 Euro area (18 countries) NaN
2001 Euro area (18 countries) NaN
2000 Euro area (17 countries) NaN
 
In [ ]:
edu.max(axis = 0)
Out[ ]:
TIME      2011
GEO      Spain
Value     8.81
dtype: object
In [ ]:
print("Pandas max function: ", edu['Value'].max())
print("Python max function: ", max(edu['Value']))
Pandas max function:  8.81
Python max function:  nan
In [ ]:
s = edu["Value"]/100
s.head()
Out[ ]:
0       NaN
1       NaN
2    0.0500
3    0.0503
4    0.0495
Name: Value, dtype: float64
In [ ]:
s = edu["Value"].apply(np.sqrt)
s.head()
Out[ ]:
0         NaN
1         NaN
2    2.236068
3    2.242766
4    2.224860
Name: Value, dtype: float64
In [ ]:
s = edu["Value"].apply(lambda d: d**2)
s.head()
Out[ ]:
0        NaN
1        NaN
2    25.0000
3    25.3009
4    24.5025
Name: Value, dtype: float64
In [ ]:
edu["ValueNorm"] = edu["Value"]/edu["Value"].max()
edu.tail()
Out[ ]:
TIMEGEOValueValueNorm379380381382383
2007 Finland 5.90 0.669694
2008 Finland 6.10 0.692395
2009 Finland 6.81 0.772985
2010 Finland 6.85 0.777526
2011 Finland 6.76 0.767310
 
In [ ]:
edu.drop('ValueNorm', axis = 1, inplace = True)
edu.head()
Out[ ]:
TIMEGEOValue01234
2000 European Union (28 countries) NaN
2001 European Union (28 countries) NaN
2002 European Union (28 countries) 5.00
2003 European Union (28 countries) 5.03
2004 European Union (28 countries) 4.95
 
In [ ]:
edu = edu.append({"TIME": 2000, "Value": 5.00, "GEO": 'a'},
                  ignore_index = True)
edu.tail()
<ipython-input-28-2372007df89b>:1: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  edu = edu.append({"TIME": 2000, "Value": 5.00, "GEO": 'a'},
Out[ ]:
TIMEGEOValue380381382383384
2008 Finland 6.10
2009 Finland 6.81
2010 Finland 6.85
2011 Finland 6.76
2000 a 5.00
 
In [ ]:
edu.drop(max(edu.index), axis = 0, inplace = True)
edu.tail()
Out[ ]:
TIMEGEOValue379380381382383
2007 Finland 5.90
2008 Finland 6.10
2009 Finland 6.81
2010 Finland 6.85
2011 Finland 6.76
 
In [ ]:
eduDrop = edu.drop(edu[edu["Value"].isnull()].index, axis=0)
eduDrop.head()
Out[ ]:
TIMEGEOValue23456
2002 European Union (28 countries) 5.00
2003 European Union (28 countries) 5.03
2004 European Union (28 countries) 4.95
2005 European Union (28 countries) 4.92
2006 European Union (28 countries) 4.91
 
In [ ]:
eduDrop = edu.dropna(how = 'any', subset = ["Value"])
eduDrop.head()
Out[ ]:
TIMEGEOValue23456
2002 European Union (28 countries) 5.00
2003 European Union (28 countries) 5.03
2004 European Union (28 countries) 4.95
2005 European Union (28 countries) 4.92
2006 European Union (28 countries) 4.91
 
In [ ]:
eduFilled = edu.fillna(value = {"Value":0})
eduFilled.head()
Out[ ]:
TIMEGEOValue01234
2000 European Union (28 countries) 0.00
2001 European Union (28 countries) 0.00
2002 European Union (28 countries) 5.00
2003 European Union (28 countries) 5.03
2004 European Union (28 countries) 4.95
 
In [ ]:
edu.sort_values(by = 'Value', ascending = False,
                inplace = True)
edu.head()
Out[ ]:
TIMEGEOValue130131129121122
2010 Denmark 8.81
2011 Denmark 8.75
2009 Denmark 8.74
2001 Denmark 8.44
2002 Denmark 8.44
 
In [ ]:
edu.sort_index(axis = 0, ascending = True, inplace = True)
edu.head()
Out[ ]:
TIMEGEOValue01234
2000 European Union (28 countries) NaN
2001 European Union (28 countries) NaN
2002 European Union (28 countries) 5.00
2003 European Union (28 countries) 5.03
2004 European Union (28 countries) 4.95
 
In [ ]:
group = edu[["GEO", "Value"]].groupby('GEO').mean()
group.head()
Out[ ]:
ValueGEOAustriaBelgiumBulgariaCyprusCzech Republic
5.618333
6.189091
4.093333
7.023333
4.168333
 
In [ ]:
filtered_data = edu[edu["TIME"] > 2005]
pivedu = pd.pivot_table(filtered_data, values = 'Value',
                       index = ['GEO'],
                       columns = ['TIME'])
pivedu.head()
Out[ ]:
TIME200620072008200920102011GEOAustriaBelgiumBulgariaCyprusCzech Republic
5.40 5.33 5.47 5.98 5.91 5.80
5.98 6.00 6.43 6.57 6.58 6.55
4.04 3.88 4.44 4.58 4.10 3.82
7.02 6.95 7.45 7.98 7.92 7.87
4.42 4.05 3.92 4.36 4.25 4.51
 
In [ ]:
pivedu.loc[['Spain','Portugal'], [2006,2011]]
Out[ ]:
TIME20062011GEOSpainPortugal
4.26 4.82
5.07 5.27
 
In [ ]:
pivedu = pivedu.drop([
    'Euro area (13 countries)',
    'Euro area (15 countries)',
    'Euro area (17 countries)',
    'Euro area (18 countries)',
    'European Union (25 countries)',
    'European Union (27 countries)',
    'European Union (28 countries)',],
    axis = 0)
pivedu = pivedu.rename(index = {'Germany (untill 1990 former territory of the FRG)': 'Germany'})
pivedu = pivedu.dropna()
pivedu.rank(ascending = False, method = 'first').head()
Out[ ]:
TIME200620072008200920102011GEOAustriaBelgiumBulgariaCyprusCzech Republic
10.0 7.0 11.0 7.0 8.0 8.0
5.0 4.0 3.0 4.0 5.0 5.0
21.0 21.0 20.0 20.0 22.0 22.0
2.0 2.0 2.0 2.0 2.0 3.0
19.0 20.0 21.0 21.0 20.0 19.0
 
In [ ]:
totalsum = pivedu.sum(axis = 1).sort_values(ascending = False)
totalsum.plot(kind = 'bar', style = 'b', alpha = 0.4,
              title = "Total Values for Country")
Out[ ]:
<Axes: title={'center': 'Total Values for Country'}, xlabel='GEO'>
In [ ]:
my_colors = ['b', 'r', 'g', 'y', 'm', 'c']
ax = pivedu.plot(kind = 'barh',
                 stacked = True,
                 color = my_colors)
ax.legend(loc = 'center left', bbox_to_anchor = (1, .5))
Out[ ]:
<matplotlib.legend.Legend at 0x7e8e20484b20>