라이브러리 import 및 설정¶
%reload_ext autoreload
%autoreload 2
%matplotlib inline
from matplotlib import rcParams, pyplot as plt
import numpy as np
import os
import pandas as pd
from pathlib import Path
import re
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import StratifiedKFold
import tensorflow as tf
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, LSTM, Dropout
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import plot_model, to_categorical
from tensorflow.keras.optimizers import Adam
import warnings
# Pin TensorFlow to the first physical GPU when one is available; otherwise
# report that no GPU was found.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Restrict TensorFlow to only use the first GPU
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as e:
        # Visible devices must be set before GPUs have been initialized
        print(e)
else:
    print('No GPU detected')
No GPU detected
# Set the default matplotlib figure size used by subsequent plots.
rcParams['figure.figsize'] = (16, 8)
# Use fully qualified option names: a bare 'max_columns' is resolved by
# pattern search and raises OptionError on pandas versions where more than one
# registered option contains that text.
pd.set_option('display.max_columns', 100)
pd.set_option("display.precision", 4)
GloVe 임베딩 로드¶
http://nlp.stanford.edu/data/glove.6B.zip 를 다운받아 `data_dir`에 압축을 푼다.
# Input data location and the build directories that receive artifacts.
data_dir = Path('../data/dacon-author-classification')
feature_dir = Path('../build/feature')
val_dir = Path('../build/val')
tst_dir = Path('../build/tst')
sub_dir = Path('../build/sub')

# Create every output directory up front; existing directories are left as-is.
dirs = [feature_dir, val_dir, tst_dir, sub_dir]
for d in dirs:
    d.mkdir(parents=True, exist_ok=True)

# Input files.
trn_file = data_dir / 'train.csv'
tst_file = data_dir / 'test_x.csv'
sample_file = data_dir / 'sample_submission.csv'
glove_file = data_dir / 'glove.6B.100d.txt'

# Experiment settings.
target_col = 'author'
n_fold = 5
n_class = 5
seed = 42

# Output file names are derived from the algorithm and feature names.
algo_name = 'lstm'
feature_name = 'glove'
model_name = f'{algo_name}_{feature_name}'

feature_file = feature_dir / f'{feature_name}.csv'
p_val_file = val_dir / f'{model_name}.val.csv'
p_tst_file = tst_dir / f'{model_name}.tst.csv'
sub_file = sub_dir / f'{model_name}.csv'
# Parse the GloVe text file into a {word: vector} lookup. Each line is split
# once on whitespace: the first field is the word, the rest are coefficients.
embeddings_index = {}
# Decode as UTF-8 explicitly so parsing does not depend on the platform's
# default text encoding.
with open(glove_file, encoding='utf-8') as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        # "f" parses the space-separated coefficients as float32.
        embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")

print(f'Found {len(embeddings_index)} word vectors.')
Found 400000 word vectors.
학습데이터 로드¶
# Load the training set, using the first CSV column as the row index.
train = pd.read_csv(trn_file, index_col=0)
text | author | |
index | ||
0 | He was almost choking. There was so much, so m... | 3 |
1 | “Your sister asked for it, I suppose?” | 2 |
2 | She was engaged one day as she walked, in per... | 1 |
3 | The captain was in the porch, keeping himself ... | 4 |
4 | “Have mercy, gentlemen!” odin flung up his han... | 3 |
# Load the test set, using the first CSV column as the row index.
test = pd.read_csv(tst_file, index_col=0)
text | |
index | |
0 | “Not at all. I think she is one of the most ch... |
1 | "No," replied he, with sudden consciousness, "... |
2 | As the lady had stated her intention of scream... |
3 | “And then suddenly in the silence I heard a so... |
4 | His conviction remained unchanged. So far as I... |
# Matches every character that is not an ASCII letter, a digit or a space.
_NON_ALNUM_SPACE = re.compile(r'[^A-Za-z0-9 ]')


def alpha_num(text: str) -> str:
    """Return *text* with all characters except ASCII letters, digits and spaces removed.

    Punctuation, quotes and any non-ASCII characters are dropped; spaces are
    kept so word boundaries survive.
    """
    return _NON_ALNUM_SPACE.sub('', text)
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as",
             "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could",
             "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has",
             "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself",
             "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself",
             "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours",
             "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that",
             "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll",
             "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll",
             "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom",
             "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

# Snapshot of the list above for O(1) membership tests. It is built once, so
# later mutations of `stopwords` are not reflected here.
_STOPWORD_SET = frozenset(stopwords)


def remove_stopwords(text: str) -> str:
    """Return *text* with stopwords removed.

    The text is split on whitespace, each token is compared case-insensitively
    against the stopword list, and the surviving tokens are re-joined with a
    single space. An empty or all-stopword input yields an empty string.
    """
    return " ".join(
        token for token in text.split() if token.lower() not in _STOPWORD_SET
    )
# Normalise the text column of both frames the same way: lower-case, strip
# everything except letters/digits/spaces, then drop stopwords.
for frame in (train, test):
    frame['text'] = frame['text'].str.lower().apply(alpha_num).apply(remove_stopwords)

# Plain arrays for the model: cleaned texts and the author labels.
trn, tst = train['text'].values, test['text'].values
y = train['author'].values
print(trn.shape, tst.shape, y.shape)
(54879,) (19617,) (54879,)
# Map raw strings to integer token sequences: vocabulary capped at 20000
# entries, every sequence padded/truncated to length 500.
vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=500)
text_ds = tf.data.Dataset.from_tensor_slices(trn).batch(128)
# Learn the vocabulary from the training texts; without this step the layer
# has no learned tokens for get_vocabulary() to return.
vectorizer.adapt(text_ds)
['', '[UNK]', 'odin', 'not', 'said']
# Build the embedding matrix: row i holds the GloVe vector of the i-th
# vocabulary token, or zeros when GloVe has no entry for it.
voc = vectorizer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))

num_tokens = len(voc) + 2
embedding_dim = 100
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        misses += 1
print(f"Converted {hits} words ({misses} misses)")
Converted 18141 words (1859 misses)
# Embedding layer initialised from the pre-built GloVe matrix.
# NOTE(review): the original argument list was lost; the arguments below are
# reconstructed from the names in scope (num_tokens, embedding_dim,
# embedding_matrix, Constant). trainable=False keeps the pre-trained vectors
# frozen — confirm this matches the intended setup.
embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=Constant(embedding_matrix),
    trainable=False,
)
# Stratified K-fold splitter with n_fold splits; shuffling is seeded so the
# fold assignment is reproducible.
cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
def get_model():
    """Build and compile a fresh two-layer bidirectional LSTM classifier.

    The model takes raw strings, runs them through the module-level
    ``vectorizer`` and ``embedding_layer``, applies two stacked bidirectional
    LSTMs and ends in a softmax over ``n_class`` outputs.

    Returns:
        A compiled Keras ``Model``.
    """
    text_input = Input(shape=(1,), dtype=tf.string)
    token_ids = vectorizer(text_input)
    embedded_sequences = embedding_layer(token_ids)
    # First LSTM returns the full sequence so the second can consume it.
    x = Bidirectional(LSTM(64, return_sequences=True))(embedded_sequences)
    x = Bidirectional(LSTM(64))(x)
    preds = Dense(n_class, activation="softmax")(x)
    model = Model(text_input, preds)

    # compile model
    # NOTE(review): the original compile call was lost. Categorical
    # cross-entropy matches the one-hot labels passed to fit; the optimizer's
    # learning rate is not visible, so Adam's default is used — confirm.
    model.compile(loss='categorical_crossentropy', optimizer=Adam())
    return model
# p_val collects out-of-fold predictions for the training rows; p_tst is the
# average of each fold model's predictions on the test rows.
p_val = np.zeros((trn.shape[0], n_class))
p_tst = np.zeros((tst.shape[0], n_class))
for i, (i_trn, i_val) in enumerate(cv.split(trn, y), 1):
    print(f'training model for CV #{i}')
    es = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=3,
                       verbose=1, mode='min', baseline=None, restore_best_weights=True)

    clf = get_model()
    # NOTE(review): the fit call was lost in the source. epochs=10 and
    # batch_size=512 are taken from the logged run ("Epoch 1/10", 86 steps per
    # epoch on a 4/5 training split) — confirm.
    # num_classes is passed explicitly so the one-hot width always equals
    # n_class, independent of which labels occur in a fold.
    clf.fit(trn[i_trn],
            to_categorical(y[i_trn], num_classes=n_class),
            validation_data=(trn[i_val], to_categorical(y[i_val], num_classes=n_class)),
            epochs=10,
            batch_size=512,
            callbacks=[es])
    p_val[i_val, :] = clf.predict(trn[i_val])
    p_tst += clf.predict(tst) / n_fold
training model for CV #1
Epoch 1/10
86/86 [==============================] - 422s 5s/step - loss: 1.2696 - val_loss: 1.0755
Epoch 2/10
86/86 [==============================] - 400s 5s/step - loss: 0.9967 - val_loss: 0.9633
Epoch 3/10
34/86 [==========>...................] - ETA: 3:40 - loss: 0.8673
# Report cross-validated accuracy and log loss from the out-of-fold predictions.
print(f'Accuracy (CV): {accuracy_score(y, np.argmax(p_val, axis=1)) * 100:8.4f}%')
print(f'Log Loss (CV): {log_loss(pd.get_dummies(y), p_val):8.4f}')

# Persist both prediction matrices as comma-separated text, six decimals each.
for out_path, predictions in ((p_val_file, p_val), (p_tst_file, p_tst)):
    np.savetxt(out_path, predictions, fmt='%.6f', delimiter=',')
제출 파일 생성¶
# Start from the sample submission, using its first column as the index.
sub = pd.read_csv(sample_file, index_col=0)
# Overwrite every column with the test-set predictions accumulated in p_tst.
sub[sub.columns] = p_tst