Notice
Recent Posts
Recent Comments
Link
«   2026/06   »
1 2 3 4 5 6
7 8 9 10 11 12 13
14 15 16 17 18 19 20
21 22 23 24 25 26 27
28 29 30
Tags
more
Archives
Today
Total
관리 메뉴

statistics program

week 5. Regression Analysis _ 현주 본문

학습 정리/따봉콩쥐야고마워

week 5. Regression Analysis _ 현주

따봉콩쥐야고마워 2024. 6. 29. 19:35

 

ch06.ipynb
0.01MB

 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [ ]:
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
In [ ]:
ice = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/SeaIce.txt',delim_whitespace=True)
print('shape:', ice.shape)
shape: (424, 6)
In [ ]:
ice2 = ice[ice.data_type != '-9999']
ice2
Out[ ]:
yearmodata_typeregionextentarea01234...419420421422423
1979 1 Goddard N 15.54 12.33
1980 1 Goddard N 14.96 11.85
1981 1 Goddard N 15.03 11.82
1982 1 Goddard N 15.26 12.11
1983 1 Goddard N 15.10 11.92
... ... ... ... ... ...
2009 12 Goddard N 12.51 10.25
2010 12 Goddard N 12.02 10.08
2011 12 Goddard N 12.40 10.28
2012 12 Goddard N 12.20 10.11
2013 12 NRTSI-G N 12.38 10.48

422 rows × 6 columns

 
In [ ]:
import seaborn as sns
sns.lmplot(x="mo", y="extent", data=ice2)
Out[ ]:
<seaborn.axisgrid.FacetGrid at 0x7db49c125150>
In [ ]:
grouped = ice2.groupby('mo')
month_means = grouped.extent.mean()
month_means
Out[ ]:
mo
1     14.479429
2     15.298889
3     15.491714
4     14.766000
5     13.396000
6     11.860000
7      9.601143
8      7.122286
9      6.404857
10     8.809143
11    10.964722
12    13.059429
Name: extent, dtype: float64

 

for i in range(12):
    ice2.extent[ice2.mo == i+1] = 100 * (ice2.extent[ice2.mo == i+1] - month_means[i+1]) /month_means.mean()
sns.lmplot(x="mo", y="extent", data=ice2)
 
Out[ ]:
<seaborn.axisgrid.FacetGrid at 0x7db46105a2f0>
sns.lmplot(x = "year", y = "extent", data = ice2)
Out[ ]:
<seaborn.axisgrid.FacetGrid at 0x7db49c125420>
In [ ]:
from sklearn.linear_model import LinearRegression
est = LinearRegression(fit_intercept = True)
In [ ]:
x = ice2[['year']]
y = ice2[['extent']]
est.fit(x, y)
print("Coefficients: ", est.coef_)
print("Intercept: ", est.intercept_)
Coefficients:  [[-0.45275459]]
Intercept:  [903.71640207]
In [ ]:
from sklearn import metrics
y_hat = est.predict(x)
print("MSE: ", metrics.mean_squared_error(y_hat, y))
print("R^2: ", metrics.r2_score(y_hat, y))
print('var: ', y.var())
MSE:  10.539131639803488
R^2:  0.5067870382100248
var:  extent    31.98324
dtype: float64
 
boston = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/BostonHousing.xls')
boston
 
0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98 24.0 0
0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14 21.6 0
0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03 34.7 1
0.03237 0.0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94 33.4 1
0.06905 0.0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33 36.2 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
0.06263 0.0 11.93 0 0.573 6.593 69.1 2.4786 1 273 21.0 391.99 9.67 22.4 0
0.04527 0.0 11.93 0 0.573 6.120 76.7 2.2875 1 273 21.0 396.90 9.08 20.6 0
0.06076 0.0 11.93 0 0.573 6.976 91.0 2.1675 1 273 21.0 396.90 5.64 23.9 0
0.10959 0.0 11.93 0 0.573 6.794 89.3 2.3889 1 273 21.0 393.45 6.48 22.0 0
0.04741 0.0 11.93 0 0.573 6.030 80.8 2.5050 1 273 21.0 396.90 7.88 11.9 0

506 rows × 15 columns

 
In [ ]:
print('Shape of data: ', boston.shape)
print('Feature names: ', boston.columns)
Shape of data:  (506, 15)
Feature names:  Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'MEDV', 'CAT. MEDV'],
      dtype='object')
In [ ]:
from sklearn import datasets
from sklearn.datasets import fetch_california_housing

california_housing = fetch_california_housing()
X_california_housing, y_california_housing = california_housing.data, california_housing.target
print("Shape of data: ", X_california_housing.shape, y_california_housing.shape)
print("Feature names: ", california_housing.feature_names)
Shape of data:  (20640, 8) (20640,)
Feature names:  ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
from sklearn import linear_model
train_size = int(len(X_california_housing) / 2)
X_train = X_california_housing[:train_size]
X_test = X_california_housing[train_size:]
y_train = y_california_housing[:train_size]
y_test = y_california_housing[train_size:]
print('Training and testing set sizes', X_train.shape, X_test.shape)

regr = LinearRegression()
regr.fit(X_train, y_train)
print('Coeff and intercept: ', regr.coef_, regr.intercept_)
print('Testing Score: ', regr.score(X_test, y_test))
print('Training MSE: ', np.mean((regr.predict(X_train)- y_train)**2))
print('Testing MSE: ', np.mean((regr.predict(X_test)- y_test)**2))
Training and testing set sizes (10320, 8) (10320, 8)
Coeff and intercept:  [ 4.49445675e-01  5.25146897e-03 -1.15187965e-01  6.49831604e-01
 -4.42831175e-06 -7.49431094e-03 -4.12760239e-01 -3.65411827e-01] -28.871818045412486
Testing Score:  0.586194727280809
Training MSE:  0.5427336522220854
Testing MSE:  0.5482126919151005
In [ ]:
regr_lasso = linear_model.Lasso(alpha = .3)
regr_lasso.fit(X_train, y_train)
print('Coeff and intercept: ', regr_lasso.coef_)
print('Testing Score: ', regr_lasso.score(X_test, y_test))
print('Training MSE: ', np.mean((regr_lasso.predict(X_train)- y_train)**2))
print('Testing MSE: ', np.mean((regr_lasso.predict(X_test)- y_test)**2))
Coeff and intercept:  [ 3.39722672e-01  1.15439467e-02  0.00000000e+00  0.00000000e+00
  4.46333418e-05 -1.97394316e-03 -3.33472332e-02  0.00000000e+00]
Testing Score:  0.4767976308497144
Training MSE:  0.6586900574320259
Testing MSE:  0.69314288173142
In [ ]:
ind = np.argsort(np.abs(regr_lasso.coef_))

ind_feature_names = [california_housing.feature_names[i] for i in ind]

print('Ordered variables (from less to more important):', ind_feature_names)
Ordered variables (from less to more important): ['AveRooms', 'AveBedrms', 'Longitude', 'Population', 'AveOccup', 'HouseAge', 'Latitude', 'MedInc']
In [ ]:
import sklearn.feature_selection as fs
selector = fs.SelectKBest(score_func = fs.f_regression, k = 5)
selector.fit_transform(X_train, y_train)
selector.fit(X_train, y_train)
print('Selected features: ', list(zip(selector.get_support(), california_housing.feature_names)))
Selected features:  [(True, 'MedInc'), (True, 'HouseAge'), (True, 'AveRooms'), (False, 'AveBedrms'), (False, 'Population'), (False, 'AveOccup'), (True, 'Latitude'), (True, 'Longitude')]

 

*그래프는 붙여넣기 오류로 파일 참조