statistics program
week 5. Regression Analysis _ 현주 본문

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [ ]:
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
In [ ]:
ice = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/SeaIce.txt',delim_whitespace=True)
print('shape:', ice.shape)
shape: (424, 6)
In [ ]:
ice2 = ice[ice.data_type != '-9999']
ice2
Out[ ]:
yearmodata_typeregionextentarea01234...419420421422423
| 1979 | 1 | Goddard | N | 15.54 | 12.33 |
| 1980 | 1 | Goddard | N | 14.96 | 11.85 |
| 1981 | 1 | Goddard | N | 15.03 | 11.82 |
| 1982 | 1 | Goddard | N | 15.26 | 12.11 |
| 1983 | 1 | Goddard | N | 15.10 | 11.92 |
| ... | ... | ... | ... | ... | ... |
| 2009 | 12 | Goddard | N | 12.51 | 10.25 |
| 2010 | 12 | Goddard | N | 12.02 | 10.08 |
| 2011 | 12 | Goddard | N | 12.40 | 10.28 |
| 2012 | 12 | Goddard | N | 12.20 | 10.11 |
| 2013 | 12 | NRTSI-G | N | 12.38 | 10.48 |
422 rows × 6 columns
In [ ]:
import seaborn as sns
sns.lmplot(x="mo", y="extent", data=ice2)
Out[ ]:
<seaborn.axisgrid.FacetGrid at 0x7db49c125150>

In [ ]:
grouped = ice2.groupby('mo')
month_means = grouped.extent.mean()
month_means
Out[ ]:
mo
1 14.479429
2 15.298889
3 15.491714
4 14.766000
5 13.396000
6 11.860000
7 9.601143
8 7.122286
9 6.404857
10 8.809143
11 10.964722
12 13.059429
Name: extent, dtype: float64
for i in range(12):
ice2.extent[ice2.mo == i+1] = 100 * (ice2.extent[ice2.mo == i+1] - month_means[i+1]) /month_means.mean()
sns.lmplot(x="mo", y="extent", data=ice2)
Out[ ]:
<seaborn.axisgrid.FacetGrid at 0x7db46105a2f0>

sns.lmplot(x = "year", y = "extent", data = ice2)
Out[ ]:
<seaborn.axisgrid.FacetGrid at 0x7db49c125420>

In [ ]:
from sklearn.linear_model import LinearRegression
est = LinearRegression(fit_intercept = True)
In [ ]:
x = ice2[['year']]
y = ice2[['extent']]
est.fit(x, y)
print("Coefficients: ", est.coef_)
print("Intercept: ", est.intercept_)
Coefficients: [[-0.45275459]]
Intercept: [903.71640207]
In [ ]:
from sklearn import metrics
y_hat = est.predict(x)
print("MSE: ", metrics.mean_squared_error(y_hat, y))
print("R^2: ", metrics.r2_score(y_hat, y))
print('var: ', y.var())
MSE: 10.539131639803488
R^2: 0.5067870382100248
var: extent 31.98324
dtype: float64
boston = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/BostonHousing.xls')
boston
| 0.00632 | 18.0 | 2.31 | 0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1 | 296 | 15.3 | 396.90 | 4.98 | 24.0 | 0 |
| 0.02731 | 0.0 | 7.07 | 0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2 | 242 | 17.8 | 396.90 | 9.14 | 21.6 | 0 |
| 0.02729 | 0.0 | 7.07 | 0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2 | 242 | 17.8 | 392.83 | 4.03 | 34.7 | 1 |
| 0.03237 | 0.0 | 2.18 | 0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3 | 222 | 18.7 | 394.63 | 2.94 | 33.4 | 1 |
| 0.06905 | 0.0 | 2.18 | 0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3 | 222 | 18.7 | 396.90 | 5.33 | 36.2 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 0.06263 | 0.0 | 11.93 | 0 | 0.573 | 6.593 | 69.1 | 2.4786 | 1 | 273 | 21.0 | 391.99 | 9.67 | 22.4 | 0 |
| 0.04527 | 0.0 | 11.93 | 0 | 0.573 | 6.120 | 76.7 | 2.2875 | 1 | 273 | 21.0 | 396.90 | 9.08 | 20.6 | 0 |
| 0.06076 | 0.0 | 11.93 | 0 | 0.573 | 6.976 | 91.0 | 2.1675 | 1 | 273 | 21.0 | 396.90 | 5.64 | 23.9 | 0 |
| 0.10959 | 0.0 | 11.93 | 0 | 0.573 | 6.794 | 89.3 | 2.3889 | 1 | 273 | 21.0 | 393.45 | 6.48 | 22.0 | 0 |
| 0.04741 | 0.0 | 11.93 | 0 | 0.573 | 6.030 | 80.8 | 2.5050 | 1 | 273 | 21.0 | 396.90 | 7.88 | 11.9 | 0 |
506 rows × 15 columns
In [ ]:
print('Shape of data: ', boston.shape)
print('Feature names: ', boston.columns)
Shape of data: (506, 15)
Feature names: Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
'PTRATIO', 'B', 'LSTAT', 'MEDV', 'CAT. MEDV'],
dtype='object')
In [ ]:
from sklearn import datasets
from sklearn.datasets import fetch_california_housing
california_housing = fetch_california_housing()
X_california_housing, y_california_housing = california_housing.data, california_housing.target
print("Shape of data: ", X_california_housing.shape, y_california_housing.shape)
print("Feature names: ", california_housing.feature_names)
Shape of data: (20640, 8) (20640,)
Feature names: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
from sklearn import linear_model
train_size = int(len(X_california_housing) / 2)
X_train = X_california_housing[:train_size]
X_test = X_california_housing[train_size:]
y_train = y_california_housing[:train_size]
y_test = y_california_housing[train_size:]
print('Training and testing set sizes', X_train.shape, X_test.shape)
regr = LinearRegression()
regr.fit(X_train, y_train)
print('Coeff and intercept: ', regr.coef_, regr.intercept_)
print('Testing Score: ', regr.score(X_test, y_test))
print('Training MSE: ', np.mean((regr.predict(X_train)- y_train)**2))
print('Testing MSE: ', np.mean((regr.predict(X_test)- y_test)**2))
Training and testing set sizes (10320, 8) (10320, 8)
Coeff and intercept: [ 4.49445675e-01 5.25146897e-03 -1.15187965e-01 6.49831604e-01
-4.42831175e-06 -7.49431094e-03 -4.12760239e-01 -3.65411827e-01] -28.871818045412486
Testing Score: 0.586194727280809
Training MSE: 0.5427336522220854
Testing MSE: 0.5482126919151005
In [ ]:
regr_lasso = linear_model.Lasso(alpha = .3)
regr_lasso.fit(X_train, y_train)
print('Coeff and intercept: ', regr_lasso.coef_)
print('Testing Score: ', regr_lasso.score(X_test, y_test))
print('Training MSE: ', np.mean((regr_lasso.predict(X_train)- y_train)**2))
print('Testing MSE: ', np.mean((regr_lasso.predict(X_test)- y_test)**2))
Coeff and intercept: [ 3.39722672e-01 1.15439467e-02 0.00000000e+00 0.00000000e+00
4.46333418e-05 -1.97394316e-03 -3.33472332e-02 0.00000000e+00]
Testing Score: 0.4767976308497144
Training MSE: 0.6586900574320259
Testing MSE: 0.69314288173142
In [ ]:
ind = np.argsort(np.abs(regr_lasso.coef_))
ind_feature_names = [california_housing.feature_names[i] for i in ind]
print('Ordered variables (from less to more important):', ind_feature_names)
Ordered variables (from less to more important): ['AveRooms', 'AveBedrms', 'Longitude', 'Population', 'AveOccup', 'HouseAge', 'Latitude', 'MedInc']
In [ ]:
import sklearn.feature_selection as fs
selector = fs.SelectKBest(score_func = fs.f_regression, k = 5)
selector.fit_transform(X_train, y_train)
selector.fit(X_train, y_train)
print('Selected features: ', list(zip(selector.get_support(), california_housing.feature_names)))
Selected features: [(True, 'MedInc'), (True, 'HouseAge'), (True, 'AveRooms'), (False, 'AveBedrms'), (False, 'Population'), (False, 'AveOccup'), (True, 'Latitude'), (True, 'Longitude')]
*그래프는 붙여넣기 오류로 파일 참조
'학습 정리 > 따봉콩쥐야고마워' 카테고리의 다른 글
| week 4. Statistical Inference _ 현주 (0) | 2024.06.29 |
|---|---|
| week 3. Descriptive Statistics _ 현주 (0) | 2024.06.29 |
| week 2. Toolboxes for Data Scientists _ 현주 (0) | 2024.06.29 |
| week 1 . 초기 환경 설정 _ 현주 (0) | 2024.05.23 |