Demo

Library imports and settings

%reload_ext autoreload
%autoreload 2
%matplotlib inline
from matplotlib import rcParams, pyplot as plt
import numpy as np
import os
import pandas as pd
from pathlib import Path
import re
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import StratifiedKFold
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, GlobalMaxPooling1D, Conv1D, Dropout, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import plot_model, to_categorical
from tensorflow.keras.optimizers import Adam
import warnings 
warnings.filterwarnings(action='ignore')
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    # Restrict TensorFlow to only use the first GPU
    try:
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as e:
        # Visible devices must be set before GPUs have been initialized
        print(e)
else:
    print('No GPU detected')
1 Physical GPUs, 1 Logical GPU
rcParams['figure.figsize'] = (16, 8)
plt.style.use('fivethirtyeight')
pd.set_option('display.max_columns', 100)
pd.set_option("display.precision", 4)
warnings.simplefilter('ignore')

Load training data

data_dir = Path('../data/dacon-author-classification')
feature_dir = Path('../build/feature')
val_dir = Path('../build/val')
tst_dir = Path('../build/tst')
sub_dir = Path('../build/sub')

dirs = [feature_dir, val_dir, tst_dir, sub_dir]
for d in dirs:
    os.makedirs(d, exist_ok=True)

trn_file = data_dir / 'train.csv'
tst_file = data_dir / 'test_x.csv'
sample_file = data_dir / 'sample_submission.csv'

target_col = 'author'
n_fold = 5
n_class = 5
seed = 42
algo_name = 'lstm'
feature_name = 'emb'
model_name = f'{algo_name}_{feature_name}'

feature_file = feature_dir / f'{feature_name}.csv'
p_val_file = val_dir / f'{model_name}.val.csv'
p_tst_file = tst_dir / f'{model_name}.tst.csv'
sub_file = sub_dir / f'{model_name}.csv'
train = pd.read_csv(trn_file, index_col=0)
train.head()
index  text                                                author
0      He was almost choking. There was so much, so m...  3
1      “Your sister asked for it, I suppose?”             2
2      She was engaged one day as she walked, in per...   1
3      The captain was in the porch, keeping himself ...  4
4      “Have mercy, gentlemen!” odin flung up his han...  3
test = pd.read_csv(tst_file, index_col=0)
test.head()
index  text
0      “Not at all. I think she is one of the most ch...
1      "No," replied he, with sudden consciousness, "...
2      As the lady had stated her intention of scream...
3      “And then suddenly in the silence I heard a so...
4      His conviction remained unchanged. So far as I...

Preprocessing

def alpha_num(text):
    """Keep only alphanumeric characters and spaces."""
    return re.sub(r'[^A-Za-z0-9 ]', '', text)


def remove_stopwords(text):
    """Drop common English stopwords (matched case-insensitively)."""
    final_text = []
    for i in text.split():
        if i.strip().lower() not in stopwords:
            final_text.append(i.strip())
    return " ".join(final_text)


stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", 
             "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", 
             "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", 
             "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", 
             "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", 
             "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", 
             "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", 
             "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", 
             "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", 
             "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", 
             "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]
train['text'] = train['text'].str.lower().apply(alpha_num).apply(remove_stopwords)
test['text'] = test['text'].str.lower().apply(alpha_num).apply(remove_stopwords)
X_train = train['text'].values
X_test = test['text'].values
y = train['author'].values
print(X_train.shape, X_test.shape, y.shape)
(54879,) (19617,) (54879,)
X_train[:3]
array(['almost choking much much wanted say strange exclamations came lips pole gazed fixedly bundle notes hand looked odin evident perplexity',
       'sister asked suppose',
       'engaged one day walked perusing janes last letter dwelling passages proved jane not written spirits instead surprised mr odin saw looking odin meeting putting away letter immediately forcing smile said'],
      dtype=object)

Training

vocab_size = 20000
embedding_dim = 64
max_length = 500
padding_type='post'
tokenizer = Tokenizer(num_words = vocab_size)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)
trn = pad_sequences(train_sequences, padding=padding_type, maxlen=max_length)
tst = pad_sequences(test_sequences, padding=padding_type, maxlen=max_length)
print(trn.shape, tst.shape)
(54879, 500) (19617, 500)
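To see what the tokenizer and padding produce, one can peek at the first training example; the exact integer IDs depend on the fitted vocabulary, but with post-padding the tail of each 500-length row is zero whenever the sentence is shorter than max_length.

print(train_sequences[0][:10])  # first ten word indices of the first cleaned sentence
print(trn[0, :10])              # the same indices at the start of the padded row
print(trn[0, -5:])              # trailing zeros added by post-padding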
cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
def get_model():
    model = Sequential([
        Embedding(vocab_size, embedding_dim, input_length=max_length),
        Bidirectional(LSTM(64, return_sequences=True)),
        Bidirectional(LSTM(64)),
        Dense(n_class, activation='softmax')
    ])
    
    model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=.01))
    return model
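The author labels are the integers 0 to 4, so they are one-hot encoded with to_categorical before being fed to categorical_crossentropy; a minimal illustration with made-up labels:

print(to_categorical([0, 2, 4], num_classes=n_class))
# [[1. 0. 0. 0. 0.]
#  [0. 0. 1. 0. 0.]
#  [0. 0. 0. 0. 1.]]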
p_val = np.zeros((trn.shape[0], n_class))  # out-of-fold predictions on the training set
p_tst = np.zeros((tst.shape[0], n_class))  # test-set predictions, averaged over folds
for i, (i_trn, i_val) in enumerate(cv.split(trn, y), 1):
    print(f'training model for CV #{i}')
    clf = get_model()
    
    es = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=3,
                       verbose=1, mode='min', baseline=None, restore_best_weights=True)

    clf.fit(trn[i_trn], 
            to_categorical(y[i_trn]),
            validation_data=(trn[i_val], to_categorical(y[i_val])),
            epochs=10,
            batch_size=512,
            callbacks=[es])
    p_val[i_val, :] = clf.predict(trn[i_val])  # out-of-fold predictions for this fold
    p_tst += clf.predict(tst) / n_fold         # accumulate the fold average on the test set
training model for CV #1
Epoch 1/10
86/86 [==============================] - 96s 1s/step - loss: 1.0807 - val_loss: 0.7758
Epoch 2/10
86/86 [==============================] - 95s 1s/step - loss: 0.5712 - val_loss: 0.7249
Epoch 3/10
86/86 [==============================] - 95s 1s/step - loss: 0.4084 - val_loss: 0.8263
Epoch 4/10
86/86 [==============================] - 95s 1s/step - loss: 0.3189 - val_loss: 0.9040
Epoch 5/10
86/86 [==============================] - ETA: 0s - loss: 0.2599Restoring model weights from the end of the best epoch.
86/86 [==============================] - 94s 1s/step - loss: 0.2599 - val_loss: 1.0049
Epoch 00005: early stopping
training model for CV #2
Epoch 1/10
86/86 [==============================] - 79s 923ms/step - loss: 1.0285 - val_loss: 0.7780
Epoch 2/10
86/86 [==============================] - 79s 915ms/step - loss: 0.5652 - val_loss: 0.7544
Epoch 3/10
86/86 [==============================] - 77s 896ms/step - loss: 0.4114 - val_loss: 0.8300
Epoch 4/10
86/86 [==============================] - 78s 903ms/step - loss: 0.3315 - val_loss: 0.8787
Epoch 5/10
86/86 [==============================] - ETA: 0s - loss: 0.2759Restoring model weights from the end of the best epoch.
86/86 [==============================] - 77s 896ms/step - loss: 0.2759 - val_loss: 1.0170
Epoch 00005: early stopping
training model for CV #3
Epoch 1/10
86/86 [==============================] - 80s 931ms/step - loss: 1.0486 - val_loss: 0.7671
Epoch 2/10
86/86 [==============================] - 82s 957ms/step - loss: 0.5703 - val_loss: 0.7311
Epoch 3/10
86/86 [==============================] - 82s 957ms/step - loss: 0.4204 - val_loss: 0.7837
Epoch 4/10
86/86 [==============================] - 83s 965ms/step - loss: 0.3361 - val_loss: 0.8628
Epoch 5/10
86/86 [==============================] - ETA: 0s - loss: 0.2770Restoring model weights from the end of the best epoch.
86/86 [==============================] - 85s 984ms/step - loss: 0.2770 - val_loss: 0.9301
Epoch 00005: early stopping
training model for CV #4
Epoch 1/10
86/86 [==============================] - 79s 915ms/step - loss: 1.0594 - val_loss: 0.7829
Epoch 2/10
86/86 [==============================] - 83s 966ms/step - loss: 0.5818 - val_loss: 0.7291
Epoch 3/10
86/86 [==============================] - 80s 932ms/step - loss: 0.4190 - val_loss: 0.7921
Epoch 4/10
86/86 [==============================] - 79s 920ms/step - loss: 0.3295 - val_loss: 0.8403
Epoch 5/10
86/86 [==============================] - ETA: 0s - loss: 0.2773Restoring model weights from the end of the best epoch.
86/86 [==============================] - 80s 930ms/step - loss: 0.2773 - val_loss: 0.9488
Epoch 00005: early stopping
training model for CV #5
Epoch 1/10
86/86 [==============================] - 77s 893ms/step - loss: 1.0697 - val_loss: 0.7714
Epoch 2/10
86/86 [==============================] - 75s 871ms/step - loss: 0.5829 - val_loss: 0.7165
Epoch 3/10
86/86 [==============================] - 76s 882ms/step - loss: 0.4133 - val_loss: 0.7774
Epoch 4/10
86/86 [==============================] - 75s 877ms/step - loss: 0.3237 - val_loss: 0.8679
Epoch 5/10
86/86 [==============================] - ETA: 0s - loss: 0.2701Restoring model weights from the end of the best epoch.
86/86 [==============================] - 75s 874ms/step - loss: 0.2701 - val_loss: 1.0399
Epoch 00005: early stopping
print(f'Accuracy (CV): {accuracy_score(y, np.argmax(p_val, axis=1)) * 100:8.4f}%')
print(f'Log Loss (CV): {log_loss(pd.get_dummies(y), p_val):8.4f}')
Accuracy (CV):  73.1846%
Log Loss (CV):   0.7312
np.savetxt(p_val_file, p_val, fmt='%.6f', delimiter=',')
np.savetxt(p_tst_file, p_tst, fmt='%.6f', delimiter=',')
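As a quick round-trip check, the saved prediction files can be reloaded to confirm that their shapes match the out-of-fold and test matrices:

print(np.loadtxt(p_val_file, delimiter=',').shape)  # expected (54879, 5)
print(np.loadtxt(p_tst_file, delimiter=',').shape)  # expected (19617, 5)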

Visualization

print(clf.summary())
Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_4 (Embedding)      (None, 500, 64)           1280000   
_________________________________________________________________
bidirectional_8 (Bidirection (None, 500, 128)          66048     
_________________________________________________________________
bidirectional_9 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense_4 (Dense)              (None, 5)                 645       
=================================================================
Total params: 1,445,509
Trainable params: 1,445,509
Non-trainable params: 0
_________________________________________________________________
None
plot_model(clf)
('Failed to import pydot. You must `pip install pydot` and install graphviz (https://graphviz.gitlab.io/download/), ', 'for `pydotprint` to work.')
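plot_model needs both the pydot package and the Graphviz system binaries; once those are installed, a call along these lines should write the architecture diagram to an image file (the file name here is just an example).

# pip install pydot, plus a Graphviz install from https://graphviz.gitlab.io/download/
# plot_model(clf, to_file='lstm_emb.png', show_shapes=True)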

Create submission file

sub = pd.read_csv(sample_file, index_col=0)
print(sub.shape)
sub.head()
(19617, 5)
index  0  1  2  3  4
0      0  0  0  0  0
1      0  0  0  0  0
2      0  0  0  0  0
3      0  0  0  0  0
4      0  0  0  0  0
sub[sub.columns] = p_tst
sub.head()
index  0       1       2       3       4
0      0.0407  0.5616  0.3168  0.0673  0.0136
1      0.0780  0.6488  0.0074  0.0058  0.2600
2      0.9714  0.0211  0.0024  0.0025  0.0026
3      0.0349  0.0148  0.8122  0.0199  0.1181
4      0.6979  0.0773  0.0068  0.1799  0.0381
sub.to_csv(sub_file)
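As a final sanity check before uploading, each row of the submission should still be a probability distribution, since averaging the softmax outputs across folds keeps the row sums at one.

assert np.allclose(sub.sum(axis=1), 1, atol=1e-3)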