라이브러리 import 및 설정
%reload_ext autoreload
%autoreload 2
%matplotlib inline
from matplotlib import rcParams, pyplot as plt
import numpy as np
import os
import pandas as pd
from pathlib import Path
import re
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import StratifiedKFold
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, GlobalMaxPooling1D, Conv1D, Dropout, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import plot_model, to_categorical
from tensorflow.keras.optimizers import Adam
import warnings
# Detect the available GPUs and, when at least one exists, make only the first
# one visible to TensorFlow.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Restrict TensorFlow to only use the first GPU
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as e:
        # Visible devices must be set before GPUs have been initialized;
        # report the actual error instead of a misleading "no GPU" message.
        print(e)
else:
    print('No GPU detected')
1 Physical GPUs, 1 Logical GPU
# Default figure size applied to matplotlib figures created after this point.
rcParams['figure.figsize'] = (16, 8)
# Use fully-qualified option names. The bare 'max_columns' pattern also matches
# 'styler.render.max_columns' on pandas >= 1.4 and raises OptionError there.
pd.set_option('display.max_columns', 100)
pd.set_option('display.precision', 4)
학습데이터 로드
# Input data location and output directories for intermediate artifacts.
data_dir = Path('../data/dacon-author-classification')
feature_dir = Path('../build/feature')
val_dir = Path('../build/val')
tst_dir = Path('../build/tst')
sub_dir = Path('../build/sub')
dirs = [feature_dir, val_dir, tst_dir, sub_dir]
for d in dirs:
    # Create missing parents as well; an existing directory is not an error.
    d.mkdir(parents=True, exist_ok=True)

# Competition input files.
trn_file = data_dir / 'train.csv'
tst_file = data_dir / 'test_x.csv'
sample_file = data_dir / 'sample_submission.csv'

# Experiment settings shared by the rest of the notebook.
target_col = 'author'
n_fold = 5
n_class = 5
seed = 42

# Output file names are derived from the algorithm and feature names.
algo_name = 'lstm'
feature_name = 'emb'
model_name = f'{algo_name}_{feature_name}'
feature_file = feature_dir / f'{feature_name}.csv'
p_val_file = val_dir / f'{model_name}.val.csv'
p_tst_file = tst_dir / f'{model_name}.tst.csv'
sub_file = sub_dir / f'{model_name}.csv'
# Load the training set, using the first CSV column as the DataFrame index.
train = pd.read_csv(trn_file, index_col=0)
index | text | author | |
0 | 0 | He was almost choking. There was so much, so m... | 3 |
1 | 1 | “Your sister asked for it, I suppose?” | 2 |
2 | 2 | She was engaged one day as she walked, in per... | 1 |
3 | 3 | The captain was in the porch, keeping himself ... | 4 |
4 | 4 | “Have mercy, gentlemen!” odin flung up his han... | 3 |
# Load the test set, using the first CSV column as the DataFrame index.
test = pd.read_csv(tst_file, index_col=0)
index | text | |
0 | 0 | “Not at all. I think she is one of the most ch... |
1 | 1 | "No," replied he, with sudden consciousness, "... |
2 | 2 | As the lady had stated her intention of scream... |
3 | 3 | “And then suddenly in the silence I heard a so... |
4 | 4 | His conviction remained unchanged. So far as I... |
# Compiled once; matches every character that is not an ASCII letter, an ASCII
# digit, or a space.
_NON_ALNUM_RE = re.compile(r'[^A-Za-z0-9 ]')


def alpha_num(text):
    """Return *text* with every character outside ``[A-Za-z0-9 ]`` removed.

    Punctuation, non-ASCII characters and whitespace other than the plain
    space (tabs, newlines) are deleted, not replaced.
    """
    return _NON_ALNUM_RE.sub('', text)
def remove_stopwords(text):
    """Return *text* with all stopwords dropped.

    The text is split on whitespace, each token is compared
    case-insensitively against the stopword list, and the surviving tokens
    (original casing kept) are joined back with single spaces.
    """
    kept = [word for word in text.split() if word.lower() not in _STOPWORD_SET]
    return " ".join(kept)


stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as",
             "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could",
             "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has",
             "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself",
             "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself",
             "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours",
             "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that",
             "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll",
             "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll",
             "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom",
             "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

# Set view of the list for O(1) membership tests in remove_stopwords().
# Built once here: later mutations of `stopwords` are NOT reflected.
_STOPWORD_SET = frozenset(stopwords)
def _clean(series):
    """Lowercase a text Series, strip non-alphanumerics, then drop stopwords."""
    # NOTE(review): alpha_num runs first and deletes apostrophes, so
    # contraction entries in the stopword list (e.g. "he's") can never match
    # at this point. Confirm that this is intended.
    lowered = series.str.lower()
    stripped = lowered.apply(alpha_num)
    return stripped.apply(remove_stopwords)


# Apply identical preprocessing to the train and test corpora.
train['text'] = _clean(train['text'])
test['text'] = _clean(test['text'])

# Raw numpy views used by the tokenizer and the CV splitter.
X_train, X_test = train['text'].values, test['text'].values
y = train['author'].values
print(X_train.shape, X_test.shape, y.shape)
(54879,) (19617,) (54879,)
array(['almost choking much much wanted say strange exclamations came lips pole gazed fixedly bundle notes hand looked odin evident perplexity',
'sister asked suppose',
'engaged one day walked perusing janes last letter dwelling passages proved jane not written spirits instead surprised mr odin saw looking odin meeting putting away letter immediately forcing smile said'],
# Tokenisation / padding hyper-parameters.
vocab_size = 20000
embedding_dim = 64
max_length = 500
# NOTE(review): `padding_type` was referenced below but never defined in the
# visible source. 'post' is assumed here; confirm against the original run.
padding_type = 'post'

tokenizer = Tokenizer(num_words=vocab_size)
# The vocabulary must be fitted before use; without this call `word_index` is
# empty and texts_to_sequences() returns empty sequences for every document.
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

# Convert texts to integer id sequences using the vocabulary fitted on train.
train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)

# Pad every sequence to exactly `max_length` ids.
trn = pad_sequences(train_sequences, padding=padding_type, maxlen=max_length)
tst = pad_sequences(test_sequences, padding=padding_type, maxlen=max_length)
print(trn.shape, tst.shape)
(54879, 500) (19617, 500)
# Stratified K-fold splitter with n_fold shuffled folds; the fixed seed keeps
# the fold assignment reproducible between runs.
cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
def get_model():
    """Build and compile a fresh bidirectional-LSTM classifier.

    Architecture: Embedding -> BiLSTM(64, full sequence) -> BiLSTM(64, last
    state) -> Dense softmax over `n_class` classes. Reads the module-level
    `vocab_size`, `embedding_dim`, `max_length` and `n_class` settings.

    Returns:
        A compiled Keras `Sequential` model with untrained weights.
    """
    model = Sequential([
        Embedding(vocab_size, embedding_dim, input_length=max_length),
        Bidirectional(LSTM(64, return_sequences=True)),
        # Second recurrent layer collapses the time axis so the Dense head
        # emits one probability vector per sample, not one per timestep.
        Bidirectional(LSTM(64)),
        Dense(n_class, activation='softmax'),
    ])
    model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=.01))
    return model
# Out-of-fold predictions for the training set, and fold-averaged predictions
# for the test set.
p_val = np.zeros((trn.shape[0], n_class))
p_tst = np.zeros((tst.shape[0], n_class))
for i, (i_trn, i_val) in enumerate(cv.split(trn, y), 1):
    print(f'training model for CV #{i}')
    clf = get_model()

    # Stop when val_loss has not improved by at least 0.001 for 3 epochs and
    # roll back to the best weights seen.
    es = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=3,
                       verbose=1, mode='min', baseline=None, restore_best_weights=True)

    # NOTE(review): the fit() call was missing from the visible source; only
    # its `validation_data` argument survived. `epochs` and `batch_size` below
    # are assumed values -- confirm against the original experiment.
    # `num_classes` is passed explicitly so the one-hot width never depends on
    # which labels happen to be present in a fold.
    clf.fit(trn[i_trn],
            to_categorical(y[i_trn], num_classes=n_class),
            validation_data=(trn[i_val], to_categorical(y[i_val], num_classes=n_class)),
            epochs=10,
            batch_size=512,
            callbacks=[es])

    p_val[i_val, :] = clf.predict(trn[i_val])
    # Each fold contributes 1/n_fold of the final test prediction.
    p_tst += clf.predict(tst) / n_fold

print(f'Accuracy (CV): {accuracy_score(y, np.argmax(p_val, axis=1)) * 100:8.4f}%')
print(f'Log Loss (CV): {log_loss(pd.get_dummies(y), p_val):8.4f}')
Accuracy (CV): 73.1846%
Log Loss (CV): 0.7312
# Persist out-of-fold and test predictions as comma-separated text with six
# decimal places.
for _out_path, _preds in ((p_val_file, p_val), (p_tst_file, p_tst)):
    np.savetxt(_out_path, _preds, fmt='%.6f', delimiter=',')
Model: "sequential_4"
Layer (type) Output Shape Param #
embedding_4 (Embedding) (None, 500, 64) 1280000
bidirectional_8 (Bidirection (None, 500, 128) 66048
bidirectional_9 (Bidirection (None, 128) 98816
dense_4 (Dense) (None, 5) 645
Total params: 1,445,509
Trainable params: 1,445,509
Non-trainable params: 0
제출 파일 생성
# Load the sample submission, using the first CSV column as the DataFrame index.
sub = pd.read_csv(sample_file, index_col=0)
(19617, 5)
0 | 1 | 2 | 3 | 4 | |
index | |||||
0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 0 | 0 | 0 | 0 |
3 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 0 | 0 | 0 |
# Overwrite every column of the sample-submission frame with the fold-averaged
# test predictions.
# NOTE(review): no write of `sub` to `sub_file` is visible in this source --
# confirm that a `sub.to_csv(sub_file)` step follows.
sub[sub.columns] = p_tst
0 | 1 | 2 | 3 | 4 | |
index | |||||
0 | 0.0407 | 0.5616 | 0.3168 | 0.0673 | 0.0136 |
1 | 0.0780 | 0.6488 | 0.0074 | 0.0058 | 0.2600 |
2 | 0.9714 | 0.0211 | 0.0024 | 0.0025 | 0.0026 |
3 | 0.0349 | 0.0148 | 0.8122 | 0.0199 | 0.1181 |
4 | 0.6979 | 0.0773 | 0.0068 | 0.1799 | 0.0381 |