Demo¶
Library imports and settings¶
%reload_ext autoreload
%autoreload 2
%matplotlib inline
from matplotlib import rcParams, pyplot as plt
import numpy as np
import os
import pandas as pd
from pathlib import Path
import re
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import StratifiedKFold
import tensorflow as tf
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, LSTM, Dropout
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import plot_model, to_categorical
from tensorflow.keras.optimizers import Adam
import warnings
warnings.filterwarnings(action='ignore')
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    # Restrict TensorFlow to only use the first GPU
    try:
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as e:
        # Visible devices must be set before GPUs have been initialized
        print(e)
else:
    print('No GPU detected')
No GPU detected
rcParams['figure.figsize'] = (16, 8)
plt.style.use('fivethirtyeight')
pd.set_option('display.max_columns', 100)
pd.set_option("display.precision", 4)
warnings.simplefilter('ignore')
Load GloVe embeddings¶
Download http://nlp.stanford.edu/data/glove.6B.zip and unzip it into data_dir.
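If the files are not already in place, a minimal sketch along these lines can fetch and unpack them (assumes network access; the path mirrors data_dir as defined in the next cell):
import urllib.request
import zipfile
from pathlib import Path

data_dir = Path('../data/dacon-author-classification')
data_dir.mkdir(parents=True, exist_ok=True)
zip_path = data_dir / 'glove.6B.zip'
if not (data_dir / 'glove.6B.100d.txt').exists():
    # ~822MB download; the archive holds the 50d/100d/200d/300d variants
    urllib.request.urlretrieve('http://nlp.stanford.edu/data/glove.6B.zip', str(zip_path))
    with zipfile.ZipFile(zip_path) as z:
        z.extractall(data_dir)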
data_dir = Path('../data/dacon-author-classification')
feature_dir = Path('../build/feature')
val_dir = Path('../build/val')
tst_dir = Path('../build/tst')
sub_dir = Path('../build/sub')
dirs = [feature_dir, val_dir, tst_dir, sub_dir]
for d in dirs:
    os.makedirs(d, exist_ok=True)
trn_file = data_dir / 'train.csv'
tst_file = data_dir / 'test_x.csv'
sample_file = data_dir / 'sample_submission.csv'
glove_file = data_dir / 'glove.6B.100d.txt'
target_col = 'author'
n_fold = 5
n_class = 5
seed = 42
algo_name = 'lstm'
feature_name = 'glove'
model_name = f'{algo_name}_{feature_name}'
feature_file = feature_dir / f'{feature_name}.csv'
p_val_file = val_dir / f'{model_name}.val.csv'
p_tst_file = tst_dir / f'{model_name}.tst.csv'
sub_file = sub_dir / f'{model_name}.csv'
embeddings_index = {}
with open(glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs
print(f'Found {len(embeddings_index)} word vectors.')
Found 400000 word vectors.
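As a quick sanity check, every loaded vector in glove.6B.100d should be 100-dimensional:
print(embeddings_index['the'].shape)  # (100,)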
Load training data¶
train = pd.read_csv(trn_file, index_col=0)
train.head()
| index | text | author |
|---|---|---|
| 0 | He was almost choking. There was so much, so m... | 3 |
| 1 | “Your sister asked for it, I suppose?” | 2 |
| 2 | She was engaged one day as she walked, in per... | 1 |
| 3 | The captain was in the porch, keeping himself ... | 4 |
| 4 | “Have mercy, gentlemen!” odin flung up his han... | 3 |
test = pd.read_csv(tst_file, index_col=0)
test.head()
| index | text |
|---|---|
| 0 | “Not at all. I think she is one of the most ch... |
| 1 | "No," replied he, with sudden consciousness, "... |
| 2 | As the lady had stated her intention of scream... |
| 3 | “And then suddenly in the silence I heard a so... |
| 4 | His conviction remained unchanged. So far as I... |
Preprocessing¶
def alpha_num(text):
    return re.sub(r'[^A-Za-z0-9 ]', '', text)

def remove_stopwords(text):
    final_text = []
    for i in text.split():
        if i.strip().lower() not in stopwords:
            final_text.append(i.strip())
    return " ".join(final_text)
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as",
"at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could",
"did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has",
"have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself",
"his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself",
"let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours",
"ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that",
"that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll",
"they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll",
"we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom",
"why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]
train['text'] = train['text'].str.lower().apply(alpha_num).apply(remove_stopwords)
test['text'] = test['text'].str.lower().apply(alpha_num).apply(remove_stopwords)
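For illustration, a made-up sentence goes through the lowercase → alpha_num → remove_stopwords pipeline like this:
sample = 'He was almost choking!'
print(remove_stopwords(alpha_num(sample.lower())))  # 'almost choking'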
trn = train['text'].values
tst = test['text'].values
y = train['author'].values
print(trn.shape, tst.shape, y.shape)
(54879,) (19617,) (54879,)
vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=500)
text_ds = tf.data.Dataset.from_tensor_slices(trn).batch(128)
vectorizer.adapt(text_ds)
vectorizer.get_vocabulary()[:5]
['', '[UNK]', 'odin', 'not', 'said']
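As in the Keras pretrained-embedding example, the adapted vectorizer maps a batch of raw strings to padded integer sequences (illustrative; 'odin' and 'said' are ids 2 and 4 per the vocabulary above):
output = vectorizer([['odin said nothing']])
print(output.shape)           # (1, 500)
print(output.numpy()[0, :4])  # ids for 'odin', 'said', 'nothing', then 0-padding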
voc = vectorizer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))
num_tokens = len(voc) + 2
embedding_dim = 100
hits = 0
misses = 0
# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print(f"Converted {hits} words ({misses} misses)")
Converted 18141 words (1859 misses)
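A peek at the out-of-GloVe tokens can be informative; since alpha_num strips apostrophes, many misses are likely to be mangled contractions (a hypothetical diagnostic, not part of the original pipeline):
missing = [w for w in word_index if w not in embeddings_index]
print(len(missing), missing[:10])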
embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=Constant(embedding_matrix),
    trainable=False,
)
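A quick check (illustrative, assuming 'said' has a GloVe vector) that the frozen layer returns the pretrained vector for an in-vocabulary word:
out = embedding_layer(tf.constant([word_index['said']])).numpy()[0]
print(np.allclose(out, embeddings_index['said']))  # True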
Training¶
cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
def get_model():
    int_sequences_input = Input(shape=(1,), dtype=tf.string)
    vectorized_sequences = vectorizer(int_sequences_input)
    embedded_sequences = embedding_layer(vectorized_sequences)
    x = Bidirectional(LSTM(64, return_sequences=True))(embedded_sequences)
    x = Bidirectional(LSTM(64))(x)
    preds = Dense(n_class, activation="softmax")(x)
    model = Model(int_sequences_input, preds)
    # compile model
    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(learning_rate=.01))
    return model
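Optionally, the architecture can be inspected before training; the commented line uses the plot_model import from above (requires pydot/graphviz):
model = get_model()
model.summary()
# plot_model(model, show_shapes=True)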
p_val = np.zeros((trn.shape[0], n_class))
p_tst = np.zeros((tst.shape[0], n_class))
for i, (i_trn, i_val) in enumerate(cv.split(trn, y), 1):
    print(f'training model for CV #{i}')
    es = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=3,
                       verbose=1, mode='min', baseline=None, restore_best_weights=True)
    clf = get_model()
    clf.fit(trn[i_trn],
            to_categorical(y[i_trn]),
            validation_data=(trn[i_val], to_categorical(y[i_val])),
            epochs=10,
            batch_size=512,
            callbacks=[es])
    p_val[i_val, :] = clf.predict(trn[i_val])
    p_tst += clf.predict(tst) / n_fold
training model for CV #1
Epoch 1/10
86/86 [==============================] - 422s 5s/step - loss: 1.2696 - val_loss: 1.0755
Epoch 2/10
86/86 [==============================] - 400s 5s/step - loss: 0.9967 - val_loss: 0.9633
Epoch 3/10
34/86 [==========>...................] - ETA: 3:40 - loss: 0.8673
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
(long framework traceback truncated: the clf.fit() call in the CV loop was interrupted manually)
print(f'Accuracy (CV): {accuracy_score(y, np.argmax(p_val, axis=1)) * 100:8.4f}%')
print(f'Log Loss (CV): {log_loss(pd.get_dummies(y), p_val):8.4f}')
np.savetxt(p_val_file, p_val, fmt='%.6f', delimiter=',')
np.savetxt(p_tst_file, p_tst, fmt='%.6f', delimiter=',')
Create submission file¶
sub = pd.read_csv(sample_file, index_col=0)
print(sub.shape)
sub.head()
sub[sub.columns] = p_tst
sub.head()
sub.to_csv(sub_file)