데모

라이브러리 import 및 설정

%reload_ext autoreload
%autoreload 2
%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import rcParams
import numpy as np
from pathlib import Path
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import seaborn as sns
import warnings
# Plot defaults applied to every figure in this notebook.
rcParams['figure.figsize'] = (16, 8)
plt.style.use('fivethirtyeight')
# Use the fully-qualified option name: the bare 'max_columns' pattern matches
# more than one option on recent pandas releases and raises OptionError.
pd.set_option('display.max_columns', 100)
pd.set_option("display.precision", 4)
# NOTE: this silences *all* warnings for the rest of the session, including
# solver convergence warnings emitted while fitting models.
warnings.simplefilter('ignore')

학습데이터 로드

03-pandas-eda.ipynb에서 생성한 피처 파일(feature.csv)을 사용합니다.

# --- Directories (all relative to the notebook's working directory) ---------
data_dir = Path('..') / 'data' / 'dacon-dku'
feature_dir, val_dir, tst_dir, sub_dir = (
    Path('..') / 'build' / leaf for leaf in ('feature', 'val', 'tst', 'sub')
)

# --- Raw competition files --------------------------------------------------
trn_file = data_dir.joinpath('train.csv')
tst_file = data_dir.joinpath('test.csv')
sample_file = data_dir.joinpath('sample_submission.csv')

# --- Experiment settings ----------------------------------------------------
target_col = 'class'
n_fold = 5
n_class = 3
seed = 42
algo_name = 'lrcv'
feature_name = 'polyfeature'
# The model name is "<algorithm>_<feature set>" and prefixes every output file.
model_name = '_'.join((algo_name, feature_name))

# --- Files produced by this notebook ----------------------------------------
feature_file = feature_dir / (feature_name + '.csv')
p_val_file = val_dir / (model_name + '.val.csv')
p_tst_file = tst_dir / (model_name + '.tst.csv')
sub_file = sub_dir / (model_name + '.csv')

Polynomial Feature 생성

# Read the base feature table; its first CSV column becomes the index.
base_feature_file = feature_dir / 'feature.csv'
df = pd.read_csv(base_feature_file, index_col=0)
print(df.shape)
df.head()
(400000, 20)
z redshift dered_u dered_g dered_r dered_i dered_z nObserve airmass_u class d_dered_u d_dered_g d_dered_r d_dered_i d_dered_z d_dered_ig d_dered_zg d_dered_rz d_dered_iz d_obs_det
id
0 16.9396 -8.1086e-05 23.1243 20.2578 18.9551 17.6321 16.9089 2.9444 1.1898 0.0 -0.1397 -0.0790 -0.0544 -0.0403 -0.0307 -2.6257 -3.3488 2.0462 0.7232 -15.0556
1 13.1689 4.5061e-03 14.9664 14.0045 13.4114 13.2363 13.1347 0.6931 1.2533 1.0 -0.0857 -0.0574 -0.0410 -0.0322 -0.0343 -0.7683 -0.8698 0.2767 0.1016 -0.3069
2 15.3500 4.7198e-04 16.6076 15.6866 15.4400 15.3217 15.2961 1.0986 1.0225 0.0 -0.1787 -0.1388 -0.0963 -0.0718 -0.0540 -0.3649 -0.3905 0.1440 0.0257 -0.9014
3 19.6346 5.8143e-06 25.3536 20.9947 20.0873 19.7947 19.5552 1.6094 1.2054 0.0 -0.3070 -0.1941 -0.1339 -0.1003 -0.0795 -1.2000 -1.4395 0.5321 0.2395 -1.3906
4 17.9826 -3.3247e-05 23.7714 20.4338 18.8630 18.1903 17.8759 2.6391 1.1939 0.0 -0.6820 -0.2653 -0.1794 -0.1339 -0.1067 -2.2436 -2.5579 0.9871 0.3144 -9.3609
# Standardise the input features, then expand them into all degree-2
# polynomial terms (bias, linear terms, squares and pairwise products).
#
# The target column is removed once, and the SAME frame is used both for the
# transform and for naming the outputs. Naming from `df.columns` (which still
# contains the target) would shift every name after the target's position by
# one, so the labels would no longer match the columns of X.
features = df.drop(target_col, axis=1)
scaler = StandardScaler()
poly = PolynomialFeatures(2)
X = poly.fit_transform(scaler.fit_transform(features))
# scikit-learn >= 1.0 provides get_feature_names_out(); the older
# get_feature_names() was removed in 1.2. Prefer the new API when available
# and fall back for older installations.
if hasattr(poly, 'get_feature_names_out'):
    feature_names = list(poly.get_feature_names_out(features.columns))
else:
    feature_names = poly.get_feature_names(features.columns)
feature_names
['1',
 'z',
 'redshift',
 'dered_u',
 'dered_g',
 'dered_r',
 'dered_i',
 'dered_z',
 'nObserve',
 'airmass_u',
 'class',
 'd_dered_u',
 'd_dered_g',
 'd_dered_r',
 'd_dered_i',
 'd_dered_z',
 'd_dered_ig',
 'd_dered_zg',
 'd_dered_rz',
 'd_dered_iz',
 'z^2',
 'z redshift',
 'z dered_u',
 'z dered_g',
 'z dered_r',
 'z dered_i',
 'z dered_z',
 'z nObserve',
 'z airmass_u',
 'z class',
 'z d_dered_u',
 'z d_dered_g',
 'z d_dered_r',
 'z d_dered_i',
 'z d_dered_z',
 'z d_dered_ig',
 'z d_dered_zg',
 'z d_dered_rz',
 'z d_dered_iz',
 'redshift^2',
 'redshift dered_u',
 'redshift dered_g',
 'redshift dered_r',
 'redshift dered_i',
 'redshift dered_z',
 'redshift nObserve',
 'redshift airmass_u',
 'redshift class',
 'redshift d_dered_u',
 'redshift d_dered_g',
 'redshift d_dered_r',
 'redshift d_dered_i',
 'redshift d_dered_z',
 'redshift d_dered_ig',
 'redshift d_dered_zg',
 'redshift d_dered_rz',
 'redshift d_dered_iz',
 'dered_u^2',
 'dered_u dered_g',
 'dered_u dered_r',
 'dered_u dered_i',
 'dered_u dered_z',
 'dered_u nObserve',
 'dered_u airmass_u',
 'dered_u class',
 'dered_u d_dered_u',
 'dered_u d_dered_g',
 'dered_u d_dered_r',
 'dered_u d_dered_i',
 'dered_u d_dered_z',
 'dered_u d_dered_ig',
 'dered_u d_dered_zg',
 'dered_u d_dered_rz',
 'dered_u d_dered_iz',
 'dered_g^2',
 'dered_g dered_r',
 'dered_g dered_i',
 'dered_g dered_z',
 'dered_g nObserve',
 'dered_g airmass_u',
 'dered_g class',
 'dered_g d_dered_u',
 'dered_g d_dered_g',
 'dered_g d_dered_r',
 'dered_g d_dered_i',
 'dered_g d_dered_z',
 'dered_g d_dered_ig',
 'dered_g d_dered_zg',
 'dered_g d_dered_rz',
 'dered_g d_dered_iz',
 'dered_r^2',
 'dered_r dered_i',
 'dered_r dered_z',
 'dered_r nObserve',
 'dered_r airmass_u',
 'dered_r class',
 'dered_r d_dered_u',
 'dered_r d_dered_g',
 'dered_r d_dered_r',
 'dered_r d_dered_i',
 'dered_r d_dered_z',
 'dered_r d_dered_ig',
 'dered_r d_dered_zg',
 'dered_r d_dered_rz',
 'dered_r d_dered_iz',
 'dered_i^2',
 'dered_i dered_z',
 'dered_i nObserve',
 'dered_i airmass_u',
 'dered_i class',
 'dered_i d_dered_u',
 'dered_i d_dered_g',
 'dered_i d_dered_r',
 'dered_i d_dered_i',
 'dered_i d_dered_z',
 'dered_i d_dered_ig',
 'dered_i d_dered_zg',
 'dered_i d_dered_rz',
 'dered_i d_dered_iz',
 'dered_z^2',
 'dered_z nObserve',
 'dered_z airmass_u',
 'dered_z class',
 'dered_z d_dered_u',
 'dered_z d_dered_g',
 'dered_z d_dered_r',
 'dered_z d_dered_i',
 'dered_z d_dered_z',
 'dered_z d_dered_ig',
 'dered_z d_dered_zg',
 'dered_z d_dered_rz',
 'dered_z d_dered_iz',
 'nObserve^2',
 'nObserve airmass_u',
 'nObserve class',
 'nObserve d_dered_u',
 'nObserve d_dered_g',
 'nObserve d_dered_r',
 'nObserve d_dered_i',
 'nObserve d_dered_z',
 'nObserve d_dered_ig',
 'nObserve d_dered_zg',
 'nObserve d_dered_rz',
 'nObserve d_dered_iz',
 'airmass_u^2',
 'airmass_u class',
 'airmass_u d_dered_u',
 'airmass_u d_dered_g',
 'airmass_u d_dered_r',
 'airmass_u d_dered_i',
 'airmass_u d_dered_z',
 'airmass_u d_dered_ig',
 'airmass_u d_dered_zg',
 'airmass_u d_dered_rz',
 'airmass_u d_dered_iz',
 'class^2',
 'class d_dered_u',
 'class d_dered_g',
 'class d_dered_r',
 'class d_dered_i',
 'class d_dered_z',
 'class d_dered_ig',
 'class d_dered_zg',
 'class d_dered_rz',
 'class d_dered_iz',
 'd_dered_u^2',
 'd_dered_u d_dered_g',
 'd_dered_u d_dered_r',
 'd_dered_u d_dered_i',
 'd_dered_u d_dered_z',
 'd_dered_u d_dered_ig',
 'd_dered_u d_dered_zg',
 'd_dered_u d_dered_rz',
 'd_dered_u d_dered_iz',
 'd_dered_g^2',
 'd_dered_g d_dered_r',
 'd_dered_g d_dered_i',
 'd_dered_g d_dered_z',
 'd_dered_g d_dered_ig',
 'd_dered_g d_dered_zg',
 'd_dered_g d_dered_rz',
 'd_dered_g d_dered_iz',
 'd_dered_r^2',
 'd_dered_r d_dered_i',
 'd_dered_r d_dered_z',
 'd_dered_r d_dered_ig',
 'd_dered_r d_dered_zg',
 'd_dered_r d_dered_rz',
 'd_dered_r d_dered_iz',
 'd_dered_i^2',
 'd_dered_i d_dered_z',
 'd_dered_i d_dered_ig',
 'd_dered_i d_dered_zg',
 'd_dered_i d_dered_rz',
 'd_dered_i d_dered_iz',
 'd_dered_z^2',
 'd_dered_z d_dered_ig',
 'd_dered_z d_dered_zg',
 'd_dered_z d_dered_rz',
 'd_dered_z d_dered_iz',
 'd_dered_ig^2',
 'd_dered_ig d_dered_zg',
 'd_dered_ig d_dered_rz',
 'd_dered_ig d_dered_iz',
 'd_dered_zg^2',
 'd_dered_zg d_dered_rz',
 'd_dered_zg d_dered_iz',
 'd_dered_rz^2',
 'd_dered_rz d_dered_iz',
 'd_dered_iz^2']
# Wrap the polynomial feature matrix in a frame that shares the index of the
# source table, then store the target values under the target column name.
df_poly = pd.DataFrame(X, index=df.index, columns=feature_names)
df_poly[target_col] = df[target_col]
df_poly.head()

# Persist the expanded feature set, drop both in-memory frames, and read the
# saved file back so the following cells work from what is on disk.
df_poly.to_csv(feature_file)
del df_poly
del df
df = pd.read_csv(feature_file, index_col=0)
print(df.shape)
df.head()
(400000, 210)
1 z redshift dered_u dered_g dered_r dered_i dered_z nObserve airmass_u class d_dered_u d_dered_g d_dered_r d_dered_i d_dered_z d_dered_ig d_dered_zg d_dered_rz d_dered_iz z^2 z redshift z dered_u z dered_g z dered_r z dered_i z dered_z z nObserve z airmass_u z class z d_dered_u z d_dered_g z d_dered_r z d_dered_i z d_dered_z z d_dered_ig z d_dered_zg z d_dered_rz z d_dered_iz redshift^2 redshift dered_u redshift dered_g redshift dered_r redshift dered_i redshift dered_z redshift nObserve redshift airmass_u redshift class redshift d_dered_u redshift d_dered_g ... class d_dered_z class d_dered_ig class d_dered_zg class d_dered_rz class d_dered_iz d_dered_u^2 d_dered_u d_dered_g d_dered_u d_dered_r d_dered_u d_dered_i d_dered_u d_dered_z d_dered_u d_dered_ig d_dered_u d_dered_zg d_dered_u d_dered_rz d_dered_u d_dered_iz d_dered_g^2 d_dered_g d_dered_r d_dered_g d_dered_i d_dered_g d_dered_z d_dered_g d_dered_ig d_dered_g d_dered_zg d_dered_g d_dered_rz d_dered_g d_dered_iz d_dered_r^2 d_dered_r d_dered_i d_dered_r d_dered_z d_dered_r d_dered_ig d_dered_r d_dered_zg d_dered_r d_dered_rz d_dered_r d_dered_iz d_dered_i^2 d_dered_i d_dered_z d_dered_i d_dered_ig d_dered_i d_dered_zg d_dered_i d_dered_rz d_dered_i d_dered_iz d_dered_z^2 d_dered_z d_dered_ig d_dered_z d_dered_zg d_dered_z d_dered_rz d_dered_z d_dered_iz d_dered_ig^2 d_dered_ig d_dered_zg d_dered_ig d_dered_rz d_dered_ig d_dered_iz d_dered_zg^2 d_dered_zg d_dered_rz d_dered_zg d_dered_iz d_dered_rz^2 d_dered_rz d_dered_iz d_dered_iz^2
id
0 1.0 0.0002 -0.1093 1.8170 0.1270 0.9086 0.0208 0.0011 1.6087 0.1200 0.0 0.0063 0.1783 -0.0002 0.0008 -0.0981 -0.0626 0.0367 0.0156 -1.3511 3.4181e-08 -2.0203e-05 0.0003 2.3488e-05 0.0002 3.8455e-06 2.0594e-07 0.0003 2.2191e-05 2.2235e-05 1.1669e-06 3.2960e-05 -3.7490e-08 1.4942e-07 -1.8144e-05 -1.1575e-05 6.7832e-06 2.8794e-06 -0.0002 0.0119 -0.1986 -0.0139 -0.0993 -0.0023 -0.0001 -0.1758 -0.0131 -0.0131 -0.0007 -0.0195 ... -0.0118 -0.0075 0.0044 0.0019 -0.1625 3.9834e-05 1.1252e-03 -1.2798e-06 5.1009e-06 -6.1941e-04 -3.9513e-04 2.3156e-04 9.8297e-05 -0.0085 0.0318 -3.6150e-05 1.4408e-04 -0.0175 -1.1161e-02 0.0065 2.7766e-03 -0.2409 4.1119e-08 -1.6389e-07 1.9901e-05 1.2695e-05 -7.4398e-06 -3.1582e-06 2.7398e-04 6.5318e-07 -7.9317e-05 -5.0598e-05 2.9652e-05 1.2587e-05 -1.0920e-03 9.6316e-03 6.1442e-03 -3.6007e-03 -1.5285e-03 0.1326 3.9195e-03 -2.2970e-03 -9.7505e-04 0.0846 1.3461e-03 5.7142e-04 -0.0496 2.4256e-04 -0.0210 1.8255
1 1.0 -0.0719 -0.1007 -2.4251 -0.2656 -2.8534 -0.1751 -0.1054 -0.8761 0.6658 1.0 0.0081 0.2144 0.0001 0.0007 0.0190 0.0156 -0.0133 -0.0071 0.5650 5.1738e-03 7.2464e-03 0.1744 1.9104e-02 0.2052 1.2597e-02 7.5809e-03 0.0630 -4.7893e-02 -1.4200e-02 -5.8349e-04 -1.5419e-02 -8.7153e-06 -5.1123e-05 -1.3637e-03 -1.1239e-03 9.5460e-04 5.0978e-04 -0.0406 0.0101 0.2443 0.0268 0.2875 0.0176 0.0106 0.0883 -0.0671 -0.0199 -0.0008 -0.0216 ... 0.0037 0.0031 -0.0026 -0.0014 0.1115 6.5806e-05 1.7389e-03 9.8290e-07 5.7656e-06 1.5379e-04 1.2675e-04 -1.0766e-04 -5.7493e-05 0.0046 0.0460 2.5973e-05 1.5236e-04 0.0041 3.3493e-03 -0.0028 -1.5193e-03 0.1211 1.4681e-08 8.6118e-08 2.2971e-06 1.8932e-06 -1.6081e-06 -8.5874e-07 6.8457e-05 5.0517e-07 1.3475e-05 1.1105e-05 -9.4327e-06 -5.0373e-06 4.0156e-04 3.5943e-04 2.9622e-04 -2.5161e-04 -1.3437e-04 0.0107 2.4412e-04 -2.0736e-04 -1.1074e-04 0.0088 1.7613e-04 9.4060e-05 -0.0075 5.0230e-05 -0.0040 0.3192
2 1.0 -0.0302 -0.1082 -1.5717 -0.1600 -1.4768 -0.0822 -0.0444 -0.4286 -1.3179 0.0 0.0013 0.0650 -0.0015 0.0002 0.0444 0.0308 -0.0170 -0.0099 0.4877 9.1298e-04 3.2708e-03 0.0475 4.8338e-03 0.0446 2.4830e-03 1.3416e-03 0.0130 3.9822e-02 -1.9476e-03 -3.9487e-05 -1.9652e-03 4.4024e-05 -5.3424e-06 -1.3413e-03 -9.2915e-04 5.1426e-04 2.9778e-04 -0.0147 0.0117 0.1701 0.0173 0.1599 0.0089 0.0048 0.0464 0.1427 -0.0070 -0.0001 -0.0070 ... 0.0029 0.0020 -0.0011 -0.0006 0.0314 1.7078e-06 8.4993e-05 -1.9040e-06 2.3106e-07 5.8012e-05 4.0186e-05 -2.2242e-05 -1.2879e-05 0.0006 0.0042 -9.4759e-05 1.1499e-05 0.0029 2.0000e-03 -0.0011 -6.4096e-04 0.0317 2.1228e-06 -2.5761e-07 -6.4677e-05 -4.4803e-05 2.4797e-05 1.4359e-05 -7.1063e-04 3.1262e-08 7.8488e-06 5.4370e-06 -3.0093e-06 -1.7425e-06 8.6238e-05 1.9706e-03 1.3651e-03 -7.5553e-04 -4.3748e-04 0.0217 9.4561e-04 -5.2337e-04 -3.0305e-04 0.0150 2.8967e-04 1.6773e-04 -0.0083 9.7124e-05 -0.0048 0.2379
3 1.0 0.0517 -0.1091 2.9763 0.1733 1.6769 0.1172 0.0758 0.1352 0.2544 0.0 -0.0033 -0.0366 -0.0026 -0.0005 -0.0083 -0.0024 -0.0061 -0.0021 0.4242 2.6757e-03 -5.6442e-03 0.1540 8.9650e-03 0.0867 6.0617e-03 3.9204e-03 0.0070 1.3162e-02 -6.1500e-03 -1.7133e-04 -1.8939e-03 -1.3410e-04 -2.6646e-05 -4.2725e-04 -1.2163e-04 -3.1351e-04 -1.0655e-04 0.0219 0.0119 -0.3248 -0.0189 -0.1830 -0.0128 -0.0083 -0.0148 -0.0278 0.0130 0.0004 0.0040 ... 0.0010 0.0003 0.0007 0.0002 -0.0504 1.0971e-05 1.2127e-04 8.5865e-06 1.7062e-06 2.7358e-05 7.7883e-06 2.0075e-05 6.8228e-06 -0.0014 0.0013 9.4914e-05 1.8860e-05 0.0003 8.6090e-05 0.0002 7.5418e-05 -0.0155 6.7204e-06 1.3354e-06 2.1412e-05 6.0957e-06 1.5712e-05 5.3400e-06 -1.0997e-03 2.6535e-07 4.2547e-06 1.2112e-06 3.1221e-06 1.0611e-06 -2.1851e-04 6.8222e-05 1.9422e-05 5.0061e-05 1.7014e-05 -0.0035 5.5290e-06 1.4251e-05 4.8436e-06 -0.0010 3.6734e-05 1.2485e-05 -0.0026 4.2431e-06 -0.0009 0.1799
4 1.0 0.0201 -0.1092 2.1535 0.1381 0.8461 0.0457 0.0284 1.2717 0.1560 0.0 -0.0093 -0.1596 -0.0039 -0.0013 -0.0741 -0.0376 0.0068 0.0007 -0.6113 4.0534e-04 -2.1983e-03 0.0434 2.7804e-03 0.0170 9.1963e-04 5.7181e-04 0.0256 3.1404e-03 -1.3188e-02 -1.8663e-04 -3.2124e-03 -7.9125e-05 -2.5252e-05 -1.4909e-03 -7.5793e-04 1.3661e-04 1.3497e-05 -0.0123 0.0119 -0.2351 -0.0151 -0.0924 -0.0050 -0.0031 -0.1389 -0.0170 0.0715 0.0010 0.0174 ... 0.0485 0.0247 -0.0044 -0.0004 0.4004 8.5934e-05 1.4791e-03 3.6433e-05 1.1627e-05 6.8645e-04 3.4898e-04 -6.2902e-05 -6.2147e-06 0.0057 0.0255 6.2708e-04 2.0013e-04 0.0118 6.0068e-03 -0.0011 -1.0697e-04 0.0975 1.5446e-05 4.9295e-06 2.9103e-04 1.4796e-04 -2.6668e-05 -2.6348e-06 2.4024e-03 1.5732e-06 9.2880e-05 4.7219e-05 -8.5109e-06 -8.4088e-07 7.6673e-04 5.4834e-03 2.7877e-03 -5.0247e-04 -4.9644e-05 0.0453 1.4173e-03 -2.5545e-04 -2.5238e-05 0.0230 4.6043e-05 4.5490e-06 -0.0041 4.4944e-07 -0.0004 0.3737

5 rows × 210 columns

# Rows [0, n_trn) are used as the training set and the remaining rows as the
# test set. (The sample submission ids start at 320000, which is consistent
# with this split point.)
n_trn = 320000
y = df[target_col].values[:n_trn]
# Remove the target in place so that `df` holds feature columns only from
# here on; later cells read `df.columns` as the feature list.
df.drop(target_col, axis=1, inplace=True)
trn = df.iloc[:n_trn].values
tst = df.iloc[n_trn:].values
# Keep the column list under its own name. Rebinding `feature_name` here would
# clobber the configuration string that the output file names are built from.
feature_cols = df.columns.tolist()
print(y.shape, trn.shape, tst.shape)
(320000,) (320000, 209) (80000, 209)

Stratified K-Fold Cross Validation

# Shuffled, stratified splitter; the fixed seed makes the folds reproducible.
cv = StratifiedKFold(
    n_splits=n_fold,
    shuffle=True,
    random_state=seed,
)

Logistic Regression 모델 학습

# Out-of-fold class probabilities for the training rows, and the fold-averaged
# class probabilities for the test rows.
p_val = np.zeros((len(trn), n_class))
p_tst = np.zeros((len(tst), n_class))
for fold, (trn_idx, val_idx) in enumerate(cv.split(trn, y), start=1):
    print(f'training model for CV #{fold}')
    clf = LogisticRegression(multi_class='multinomial')
    clf.fit(trn[trn_idx], y[trn_idx])
    # Each training row is predicted exactly once, by the model that did not
    # see it during fitting.
    fold_val_proba = clf.predict_proba(trn[val_idx])
    p_val[val_idx, :] = fold_val_proba
    # Every fold contributes an equal share to the test prediction.
    fold_tst_proba = clf.predict_proba(tst)
    p_tst += fold_tst_proba / n_fold
training model for CV #1
training model for CV #2
training model for CV #3
training model for CV #4
training model for CV #5
# Out-of-fold accuracy: the predicted label is the column with the highest
# probability in each row of p_val.
val_acc = accuracy_score(y, p_val.argmax(axis=1))
print(f'{val_acc * 100:.4f}%')
89.6659%
# Sanity-check the shapes of the validation and test probability matrices.
print(*(proba.shape for proba in (p_val, p_tst)))
(320000, 3) (80000, 3)
# Make sure the output directories exist before writing, so that the results
# of the training loop are not lost to a FileNotFoundError on a fresh checkout.
p_val_file.parent.mkdir(parents=True, exist_ok=True)
p_tst_file.parent.mkdir(parents=True, exist_ok=True)
# One row per sample, one comma-separated probability per class, no header.
np.savetxt(p_val_file, p_val, fmt='%.6f', delimiter=',')
np.savetxt(p_tst_file, p_tst, fmt='%.6f', delimiter=',')

피처 중요도 시각화

# Shape of the coefficient matrix of the most recently fitted model.
np.shape(clf.coef_)
(3, 209)
# Plot the coefficients in row 1 of clf.coef_, sorted ascending, as a
# horizontal bar chart. `clf` is the model fitted in the last CV fold only,
# so this reflects that single model rather than an average over folds.
row1_coef = clf.coef_[1]
imp = (
    pd.DataFrame({'feature': df.columns, 'importance': row1_coef})
    .sort_values('importance')
    .set_index('feature')
)
imp.plot(kind='barh', figsize=(8, 32))
<matplotlib.axes._subplots.AxesSubplot at 0x7fa3b82d9390>
../_images/08-lr-cv_25_1.png

제출 파일 생성

# Load the sample submission as a template; its first CSV column is the index.
sub = pd.read_csv(filepath_or_buffer=sample_file, index_col=0)
n_rows, n_cols = sub.shape
print((n_rows, n_cols))
sub.head(5)
(80000, 1)
class
id
320000 0
320001 0
320002 0
320003 0
320004 0
# Predicted label = index of the largest fold-averaged probability per row.
# NOTE(review): this relies on the probability columns being ordered as
# labels 0..n_class-1 - confirm against clf.classes_.
sub[target_col] = p_tst.argmax(axis=1)
sub.head(5)
class
id
320000 2
320001 0
320002 2
320003 0
320004 2
# How many test rows were assigned to each predicted class.
sub.loc[:, target_col].value_counts()
2    42870
0    31013
1     6117
Name: class, dtype: int64
# Create the submission directory if needed so the write cannot fail on a
# missing folder, then save the predictions with the index as first column.
sub_file.parent.mkdir(parents=True, exist_ok=True)
sub.to_csv(sub_file)