# 데모

## 라이브러리 import 및 설정

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import lightgbm as lgb
from matplotlib import pyplot as plt
from matplotlib import rcParams
import numpy as np
from pathlib import Path
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
import seaborn as sns
import warnings

In [3]:
rcParams['figure.figsize'] = (16, 8)
plt.style.use('fivethirtyeight')
pd.set_option('max_columns', 100)
pd.set_option("display.precision", 4)
warnings.simplefilter('ignore')

## 학습데이터 로드

[03-pandas-eda.ipynb](https://github.com/kaggler-tv/dku-kaggle-class/blob/master/notebook/03-pandas-eda.ipynb)에서 생성한 `feature.csv` 피처파일 사용

In [4]:
data_dir = Path('../data/dacon-dku')
feature_dir = Path('../build/feature')
val_dir = Path('../build/val')
tst_dir = Path('../build/tst')
sub_dir = Path('../build/sub')

trn_file = data_dir / 'train.csv'
tst_file = data_dir / 'test.csv'
sample_file = data_dir / 'sample_submission.csv'

target_col = 'class'
n_fold = 5
n_class = 3
seed = 42

In [6]:
algo_name = 'lgbcv'
feature_name = 'stacking1'
model_name = f'{algo_name}_{feature_name}'

feature_file = feature_dir / f'{feature_name}.csv'
p_val_file = val_dir / f'{model_name}.val.csv'
p_tst_file = tst_dir / f'{model_name}.tst.csv'
sub_file = sub_dir / f'{model_name}.csv'

## Stacking Feature 생성

In [8]:
model_names = ['lrcv_polyfeature', 'rfcv_feature', 'lgbcv_feature']
trn = []
tst = []
feature_names = []
for model in model_names:
    trn.append(np.loadtxt(val_dir / f'{model}.val.csv', delimiter=','))
    tst.append(np.loadtxt(tst_dir / f'{model}.tst.csv', delimiter=','))
    feature_names += [f'{model}_class0', f'{model}_class1', f'{model}_class2']
    
trn = np.hstack(trn)
tst = np.hstack(tst)
feature_names

['lrcv_polyfeature_class0',
 'lrcv_polyfeature_class1',
 'lrcv_polyfeature_class2',
 'rfcv_feature_class0',
 'rfcv_feature_class1',
 'rfcv_feature_class2',
 'lgbcv_feature_class0',
 'lgbcv_feature_class1',
 'lgbcv_feature_class2']

In [9]:
y = pd.read_csv(trn_file, index_col=0, usecols=['id', target_col]).values.flatten()
y.shape

(320000,)

## Stratified K-Fold Cross Validation

In [10]:
cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)

## LightGBM 모델 학습

In [11]:
p_val = np.zeros((trn.shape[0], n_class))
p_tst = np.zeros((tst.shape[0], n_class))
for i, (i_trn, i_val) in enumerate(cv.split(trn, y), 1):
    print(f'training model for CV #{i}')
    clf = lgb.LGBMClassifier(objective='multiclass',
                             n_estimators=1000,
                             num_leaves=64,
                             learning_rate=0.1,
                             min_child_samples=10,
                             subsample=.5,
                             subsample_freq=1,
                             colsample_bytree=.8,
                             random_state=seed,
                             n_jobs=-1)
    clf.fit(trn[i_trn], y[i_trn],
            eval_set=[(trn[i_val], y[i_val])],
            eval_metric='multiclass',
            early_stopping_rounds=10)
    
    p_val[i_val, :] = clf.predict_proba(trn[i_val])
    p_tst += clf.predict_proba(tst) / n_fold

training model for CV #1
[1]	valid_0's multi_logloss: 0.872561
Training until validation scores don't improve for 10 rounds
[2]	valid_0's multi_logloss: 0.781259
[3]	valid_0's multi_logloss: 0.705565
[4]	valid_0's multi_logloss: 0.641647
[5]	valid_0's multi_logloss: 0.586861
[6]	valid_0's multi_logloss: 0.539686
[7]	valid_0's multi_logloss: 0.498697
[8]	valid_0's multi_logloss: 0.462928
[9]	valid_0's multi_logloss: 0.431543
[10]	valid_0's multi_logloss: 0.403925
[11]	valid_0's multi_logloss: 0.379427
[12]	valid_0's multi_logloss: 0.357745
[13]	valid_0's multi_logloss: 0.33846
[14]	valid_0's multi_logloss: 0.321341
[15]	valid_0's multi_logloss: 0.306008
[16]	valid_0's multi_logloss: 0.292285
[17]	valid_0's multi_logloss: 0.280025
[18]	valid_0's multi_logloss: 0.269093
[19]	valid_0's multi_logloss: 0.259231
[20]	valid_0's multi_logloss: 0.250454
[21]	valid_0's multi_logloss: 0.242495
[22]	valid_0's multi_logloss: 0.235348
[23]	valid_0's multi_logloss: 0.228915
[24]	valid_0's multi_loglos

[12]	valid_0's multi_logloss: 0.359351
[13]	valid_0's multi_logloss: 0.340064
[14]	valid_0's multi_logloss: 0.322961
[15]	valid_0's multi_logloss: 0.30769
[16]	valid_0's multi_logloss: 0.294058
[17]	valid_0's multi_logloss: 0.281866
[18]	valid_0's multi_logloss: 0.270968
[19]	valid_0's multi_logloss: 0.261074
[20]	valid_0's multi_logloss: 0.252274
[21]	valid_0's multi_logloss: 0.244306
[22]	valid_0's multi_logloss: 0.237134
[23]	valid_0's multi_logloss: 0.230648
[24]	valid_0's multi_logloss: 0.224885
[25]	valid_0's multi_logloss: 0.219671
[26]	valid_0's multi_logloss: 0.214968
[27]	valid_0's multi_logloss: 0.210653
[28]	valid_0's multi_logloss: 0.206748
[29]	valid_0's multi_logloss: 0.203192
[30]	valid_0's multi_logloss: 0.199978
[31]	valid_0's multi_logloss: 0.19708
[32]	valid_0's multi_logloss: 0.194418
[33]	valid_0's multi_logloss: 0.191981
[34]	valid_0's multi_logloss: 0.1898
[35]	valid_0's multi_logloss: 0.187818
[36]	valid_0's multi_logloss: 0.186013
[37]	valid_0's multi_logloss:

[12]	valid_0's multi_logloss: 0.355251
[13]	valid_0's multi_logloss: 0.335915
[14]	valid_0's multi_logloss: 0.318731
[15]	valid_0's multi_logloss: 0.303479
[16]	valid_0's multi_logloss: 0.289859
[17]	valid_0's multi_logloss: 0.27766
[18]	valid_0's multi_logloss: 0.266741
[19]	valid_0's multi_logloss: 0.256899
[20]	valid_0's multi_logloss: 0.248123
[21]	valid_0's multi_logloss: 0.240227
[22]	valid_0's multi_logloss: 0.233112
[23]	valid_0's multi_logloss: 0.22671
[24]	valid_0's multi_logloss: 0.220957
[25]	valid_0's multi_logloss: 0.215741
[26]	valid_0's multi_logloss: 0.210974
[27]	valid_0's multi_logloss: 0.206716
[28]	valid_0's multi_logloss: 0.202885
[29]	valid_0's multi_logloss: 0.199392
[30]	valid_0's multi_logloss: 0.196203
[31]	valid_0's multi_logloss: 0.193356
[32]	valid_0's multi_logloss: 0.190782
[33]	valid_0's multi_logloss: 0.188429
[34]	valid_0's multi_logloss: 0.186254
[35]	valid_0's multi_logloss: 0.184275
[36]	valid_0's multi_logloss: 0.182449
[37]	valid_0's multi_loglos

In [12]:
print(f'{accuracy_score(y, np.argmax(p_val, axis=1)) * 100:.4f}%')

93.1559%


In [13]:
print(p_val.shape, p_tst.shape)

(320000, 3) (80000, 3)


In [14]:
np.savetxt(p_val_file, p_val, fmt='%.6f', delimiter=',')
np.savetxt(p_tst_file, p_tst, fmt='%.6f', delimiter=',')

## 피처 중요도 시각화

In [15]:
clf.coef_.shape

AttributeError: 'LGBMClassifier' object has no attribute 'coef_'

In [None]:
imp = pd.DataFrame({'feature': feature_names, 'importance': clf.feature_importances_})
imp = imp.sort_values('importance').set_index('feature')
imp.plot(kind='barh')

## 제출 파일 생성

In [None]:
sub = pd.read_csv(sample_file, index_col=0)
print(sub.shape)
sub.head()

In [None]:
sub[target_col] = np.argmax(p_tst, axis=1)
sub.head()

In [None]:
sub[target_col].value_counts()

In [None]:
sub.to_csv(sub_file)