
라이브러리 import 및 설정

%reload_ext autoreload
%autoreload 2
%matplotlib inline
!pip install -U pip
!pip install -U scikit-learn
!pip install -U tensorflow
!pip install -U tensorflow_hub
!pip install -U sentencepiece
import gc
from matplotlib import rcParams, pyplot as plt
import numpy as np
import os
import pandas as pd
from pathlib import Path
import re
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import StratifiedKFold
import sys
import tensorflow as tf
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.backend import clear_session
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, LSTM, Dropout
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import plot_model, to_categorical
from tensorflow.keras.optimizers import Adam
import tensorflow_hub as hub
import tokenization
import warnings 
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    # Restrict TensorFlow to only use the first GPU
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as e:
        # Visible devices must be set before GPUs have been initialized
    print('No GPU detected')
1 Physical GPUs, 1 Logical GPU
rcParams['figure.figsize'] = (16, 8)
pd.set_option('max_columns', 100)
pd.set_option("display.precision", 4)

BERT Tokenizer 로드

http://nlp.stanford.edu/data/glove.6B.zip 를 다운받아 data_dir에 압축을 푼다.

data_dir = Path('../data/dacon-author-classification')
feature_dir = Path('../build/feature')
val_dir = Path('../build/val')
tst_dir = Path('../build/tst')
sub_dir = Path('../build/sub')
dirs = [feature_dir, val_dir, tst_dir, sub_dir]
for d in dirs:
    os.makedirs(d, exist_ok=True)

trn_file = data_dir / 'train.csv'
tst_file = data_dir / 'test_x.csv'
sample_file = data_dir / 'sample_submission.csv'
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"

target_col = 'author'
n_fold = 5
n_class = 5
seed = 42
algo_name = 'bert'
max_len = 100
feature_name = f'n{max_len}'
model_name = f'{algo_name}_{feature_name}'

feature_file = feature_dir / f'{feature_name}.csv'
p_val_file = val_dir / f'{model_name}.val.csv'
p_tst_file = tst_dir / f'{model_name}.tst.csv'
sub_file = sub_dir / f'{model_name}.csv'
bert_layer = hub.KerasLayer(module_url, trainable=True)
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

학습데이터 로드

train = pd.read_csv(trn_file, index_col=0)
text author
0 He was almost choking. There was so much, so m... 3
1 “Your sister asked for it, I suppose?” 2
2 She was engaged one day as she walked, in per... 1
3 The captain was in the porch, keeping himself ... 4
4 “Have mercy, gentlemen!” odin flung up his han... 3
test = pd.read_csv(tst_file, index_col=0)
0 “Not at all. I think she is one of the most ch...
1 "No," replied he, with sudden consciousness, "...
2 As the lady had stated her intention of scream...
3 “And then suddenly in the silence I heard a so...
4 His conviction remained unchanged. So far as I...


# https://www.kaggle.com/xhlulu/disaster-nlp-keras-bert-using-tfhub
def bert_encode(texts, tokenizer, max_len=max_len):
    all_tokens = []
    all_masks = []
    all_segments = []
    for text in texts:
        text = tokenizer.tokenize(text)
        text = text[:max_len-2]
        input_sequence = ["[CLS]"] + text + ["[SEP]"]
        pad_len = max_len - len(input_sequence)
        tokens = tokenizer.convert_tokens_to_ids(input_sequence)
        tokens += [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len
    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)
trn = bert_encode(train.text.values, tokenizer, max_len=max_len)
tst = bert_encode(test.text.values, tokenizer, max_len=max_len)
y = train['author'].values
print(trn[0].shape, tst[0].shape, y.shape)
(54879, 100) (19617, 100) (54879,)


def get_model(bert_layer, max_len=max_len):
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    clf_output = sequence_output[:, 0, :]
    out = Dense(n_class, activation='sigmoid')(clf_output)
    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(Adam(lr=1e-5), loss='binary_crossentropy', metrics=['accuracy'])
    return model
cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
p_val = np.zeros((trn[0].shape[0], n_class))
p_tst = np.zeros((tst[0].shape[0], n_class))
for i, (i_trn, i_val) in enumerate(cv.split(trn[0], y), 1):
    print(f'training model for CV #{i}')
    es = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=3,
                       verbose=1, mode='min', baseline=None, restore_best_weights=True)
    clf = get_model(bert_layer, max_len=max_len)
    if i == 1:
    clf.fit([x[i_trn] for x in trn], 
            validation_data=([x[i_val] for x in trn], to_categorical(y[i_val])),
    p_val[i_val, :] = clf.predict([x[i_val] for x in trn])
    p_tst += clf.predict(tst) / n_fold
    del clf
training model for CV #1
Model: "functional_1"
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 100)]        0                                            
input_mask (InputLayer)         [(None, 100)]        0                                            
segment_ids (InputLayer)        [(None, 100)]        0                                            
keras_layer (KerasLayer)        [(None, 1024), (None 335141889   input_word_ids[0][0]             
tf_op_layer_strided_slice (Tens [(None, 1024)]       0           keras_layer[0][1]                
dense (Dense)                   (None, 5)            5125        tf_op_layer_strided_slice[0][0]  
Total params: 335,147,014
Trainable params: 335,147,013
Non-trainable params: 1
Epoch 1/2
2744/2744 [==============================] - 2489s 907ms/step - loss: 0.2035 - accuracy: 0.7820 - val_loss: 0.1440 - val_accuracy: 0.8507
Epoch 2/2
2744/2744 [==============================] - 2478s 903ms/step - loss: 0.0823 - accuracy: 0.9204 - val_loss: 0.1395 - val_accuracy: 0.8622
training model for CV #2
Epoch 1/2
2744/2744 [==============================] - 2496s 910ms/step - loss: 0.0785 - accuracy: 0.9289 - val_loss: 0.0422 - val_accuracy: 0.9635
Epoch 2/2
2744/2744 [==============================] - 2688s 979ms/step - loss: 0.0263 - accuracy: 0.9771 - val_loss: 0.0533 - val_accuracy: 0.9503
training model for CV #3
Epoch 1/2
2744/2744 [==============================] - 2510s 915ms/step - loss: 0.0290 - accuracy: 0.9767 - val_loss: 0.0151 - val_accuracy: 0.9873
Epoch 2/2
2744/2744 [==============================] - 2492s 908ms/step - loss: 0.0158 - accuracy: 0.9875 - val_loss: 0.0224 - val_accuracy: 0.9806
training model for CV #4
Epoch 1/2
2744/2744 [==============================] - 2429s 885ms/step - loss: 0.0174 - accuracy: 0.9869 - val_loss: 0.0125 - val_accuracy: 0.9900
Epoch 2/2
2744/2744 [==============================] - 2425s 884ms/step - loss: 0.0089 - accuracy: 0.9928 - val_loss: 0.0263 - val_accuracy: 0.9809
training model for CV #5
Epoch 1/2
2744/2744 [==============================] - 2433s 886ms/step - loss: 0.0147 - accuracy: 0.9897 - val_loss: 0.0061 - val_accuracy: 0.9950
Epoch 2/2
2744/2744 [==============================] - 2443s 890ms/step - loss: 0.0075 - accuracy: 0.9944 - val_loss: 0.0084 - val_accuracy: 0.9929
print(f'Accuracy (CV): {accuracy_score(y, np.argmax(p_val, axis=1)) * 100:8.4f}%')
print(f'Log Loss (CV): {log_loss(pd.get_dummies(y), p_val):8.4f}')
Accuracy (CV):  95.3370%
Log Loss (CV):   0.1380
np.savetxt(p_val_file, p_val, fmt='%.6f', delimiter=',')
np.savetxt(p_tst_file, p_tst, fmt='%.6f', delimiter=',')

제출 파일 생성

sub = pd.read_csv(sample_file, index_col=0)
(19617, 5)
0 1 2 3 4
0 0 0 0 0 0
1 0 0 0 0 0
2 0 0 0 0 0
3 0 0 0 0 0
4 0 0 0 0 0
sub[sub.columns] = p_tst
0 1 2 3 4
0 6.4148e-04 0.6342 0.3968 3.2864e-04 2.2312e-04
1 7.3567e-05 0.9995 0.0002 2.9280e-04 1.9901e-04
2 9.9783e-01 0.0043 0.0001 8.8071e-05 6.5895e-05
3 4.2704e-05 0.0002 0.9994 1.2805e-04 8.2788e-05
4 9.9781e-01 0.0013 0.0004 6.9908e-04 2.0771e-04