Demo
Library Imports and Settings
%reload_ext autoreload
%autoreload 2
%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import rcParams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
from pathlib import Path
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
import seaborn as sns
import warnings
rcParams['figure.figsize'] = (16, 8)
plt.style.use('fivethirtyeight')
pd.set_option('display.max_columns', 100)
pd.set_option("display.precision", 4)
warnings.simplefilter('ignore')
Loading the Training Data
data_dir = Path('../data/dacon-author-classification')
feature_dir = Path('../build/feature')
val_dir = Path('../build/val')
tst_dir = Path('../build/tst')
sub_dir = Path('../build/sub')
trn_file = data_dir / 'train.csv'
tst_file = data_dir / 'test_x.csv'
sample_file = data_dir / 'sample_submission.csv'
target_col = 'author'
n_fold = 5
n_class = 5
seed = 42
algo_name = 'lr'
feature_name = 'tfidf'
model_name = f'{algo_name}_{feature_name}'
feature_file = feature_dir / f'{feature_name}.csv'
p_val_file = val_dir / f'{model_name}.val.csv'
p_tst_file = tst_dir / f'{model_name}.tst.csv'
sub_file = sub_dir / f'{model_name}.csv'
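The build directories under ../build are assumed to exist already; if not, a sketch for creating them up front so the file writes later in the notebook do not fail:

# Assumption: the output directories may not exist yet; create them so the
# np.savetxt() and to_csv() calls below do not fail.
for d in [feature_dir, val_dir, tst_dir, sub_dir]:
    d.mkdir(parents=True, exist_ok=True)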
trn = pd.read_csv(trn_file, index_col=0)
print(trn.shape)
trn.head()
(54879, 2)
index | text | author
---|---|---
0 | He was almost choking. There was so much, so m... | 3
1 | “Your sister asked for it, I suppose?” | 2
2 | She was engaged one day as she walked, in per... | 1
3 | The captain was in the porch, keeping himself ... | 4
4 | “Have mercy, gentlemen!” odin flung up his han... | 3
tst = pd.read_csv(tst_file, index_col=0)
print(tst.shape)
tst.head()
(19617, 1)
index | text
---|---
0 | “Not at all. I think she is one of the most ch...
1 | "No," replied he, with sudden consciousness, "...
2 | As the lady had stated her intention of scream...
3 | “And then suddenly in the silence I heard a so...
4 | His conviction remained unchanged. So far as I...
NLTK Example
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
s = trn.text[4]
print(s)
“Have mercy, gentlemen!” odin flung up his hands. “Don’t write that, anyway; have some shame. Here I’ve torn my heart asunder before you, and you seize the opportunity and are fingering the wounds in both halves.... Oh, my God!”
tokens = word_tokenize(s)
print(tokens)
['“', 'Have', 'mercy', ',', 'gentlemen', '!', '”', 'odin', 'flung', 'up', 'his', 'hands', '.', '“', 'Don', '’', 't', 'write', 'that', ',', 'anyway', ';', 'have', 'some', 'shame', '.', 'Here', 'I', '’', 've', 'torn', 'my', 'heart', 'asunder', 'before', 'you', ',', 'and', 'you', 'seize', 'the', 'opportunity', 'and', 'are', 'fingering', 'the', 'wounds', 'in', 'both', 'halves', '....', 'Oh', ',', 'my', 'God', '!', '”']
lemmatizer = WordNetLemmatizer()
[lemmatizer.lemmatize(t) for t in tokens]
['“',
'Have',
'mercy',
',',
'gentleman',
'!',
'”',
'odin',
'flung',
'up',
'his',
'hand',
'.',
'“',
'Don',
'’',
't',
'write',
'that',
',',
'anyway',
';',
'have',
'some',
'shame',
'.',
'Here',
'I',
'’',
've',
'torn',
'my',
'heart',
'asunder',
'before',
'you',
',',
'and',
'you',
'seize',
'the',
'opportunity',
'and',
'are',
'fingering',
'the',
'wound',
'in',
'both',
'half',
'....',
'Oh',
',',
'my',
'God',
'!',
'”']
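Note that WordNetLemmatizer treats every token as a noun unless a part-of-speech tag is supplied, which is why inflected verbs such as 'flung' and 'torn' above pass through unchanged. A minimal sketch of the difference (pos='v' is WordNet's verb tag):

# WordNetLemmatizer defaults to pos='n' (noun); pass pos='v' to lemmatize
# verbs. Without it, inflected verbs are returned unchanged.
print(lemmatizer.lemmatize('flung'))           # 'flung'
print(lemmatizer.lemmatize('flung', pos='v'))  # 'fling'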
stemmer = SnowballStemmer("english")
[stemmer.stem(t) for t in tokens]
['“',
'have',
'merci',
',',
'gentlemen',
'!',
'”',
'odin',
'flung',
'up',
'his',
'hand',
'.',
'“',
'don',
'’',
't',
'write',
'that',
',',
'anyway',
';',
'have',
'some',
'shame',
'.',
'here',
'i',
'’',
've',
'torn',
'my',
'heart',
'asund',
'befor',
'you',
',',
'and',
'you',
'seiz',
'the',
'opportun',
'and',
'are',
'finger',
'the',
'wound',
'in',
'both',
'halv',
'....',
'oh',
',',
'my',
'god',
'!',
'”']
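Unlike the lemmatizer, the Snowball stemmer is purely rule-based, so it can produce tokens that are not dictionary words ('merci', 'opportun', 'halv' above). A small sketch of the contrast:

# Stemming strips suffixes by rule and may yield non-words;
# lemmatization maps tokens to dictionary forms.
print(stemmer.stem('opportunity'))          # 'opportun'
print(lemmatizer.lemmatize('opportunity'))  # 'opportunity'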
Bag-of-Words Feature Generation
vec = CountVectorizer(tokenizer=word_tokenize, stop_words=stopwords.words('english'), ngram_range=(1, 2), min_df=100)
X_cnt = vec.fit_transform(trn['text'])
print(X_cnt.shape)
(54879, 2685)
X_cnt[0, :50].todense()
matrix([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0]])
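Column 39 of the first document has a count of 4. To see which n-gram that column corresponds to, the fitted vocabulary can be inspected (a sketch; get_feature_names_out requires scikit-learn >= 1.0, older versions expose get_feature_names instead):

# A sketch for inspecting the fitted CountVectorizer vocabulary.
feature_names = vec.get_feature_names_out()
print(feature_names[39])     # the n-gram counted 4 times in document 0
print(len(vec.vocabulary_))  # 2685 features in total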
vec = TfidfVectorizer(tokenizer=word_tokenize, stop_words=stopwords.words('english'), ngram_range=(1, 3), min_df=50)
X = vec.fit_transform(trn['text'])
X_tst = vec.transform(tst['text'])
print(X.shape, X_tst.shape)
(54879, 5897) (19617, 5897)
X[0, :50].todense()
matrix([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0.]])
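HashingVectorizer is imported above but unused in this demo. As a sketch, it is a stateless alternative that hashes n-grams into a fixed number of columns instead of storing a vocabulary, which keeps memory flat on large corpora (the n_features value below is an arbitrary illustration):

# A sketch, not part of the demo pipeline: HashingVectorizer needs no fit
# and stores no vocabulary; hash collisions are possible by design.
hvec = HashingVectorizer(tokenizer=word_tokenize, ngram_range=(1, 3),
                         n_features=2**20)  # arbitrary width for illustration
X_hash = hvec.transform(trn['text'])        # stateless: no fit required
print(X_hash.shape)                         # (54879, 1048576)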
Training a Logistic Regression Model
cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
y = trn.author.values
y.shape
(54879,)
p = np.zeros((X.shape[0], n_class))          # out-of-fold predictions
p_tst = np.zeros((X_tst.shape[0], n_class))  # test predictions, averaged over folds
for i_cv, (i_trn, i_val) in enumerate(cv.split(X, y), 1):
    clf = LogisticRegression()
    clf.fit(X[i_trn], y[i_trn])
    p[i_val, :] = clf.predict_proba(X[i_val])
    p_tst += clf.predict_proba(X_tst) / n_class
print(f'Accuracy (CV): {accuracy_score(y, np.argmax(p, axis=1)) * 100:8.4f}%')
print(f'Log Loss (CV): {log_loss(pd.get_dummies(y), p):8.4f}')
Accuracy (CV): 76.6140%
Log Loss (CV): 0.6800
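For reference, the log loss reported above is the mean negative log-probability assigned to the true class; a minimal sketch of the same computation (scikit-learn additionally clips probabilities away from 0 and 1):

# Mean negative log-probability of the true class, the quantity log_loss
# computes (without scikit-learn's probability clipping).
manual_ll = -np.mean(np.log(p[np.arange(len(y)), y]))
print(f'{manual_ll:8.4f}')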
np.savetxt(p_val_file, p, fmt='%.6f', delimiter=',')
np.savetxt(p_tst_file, p_tst, fmt='%.6f', delimiter=',')
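The out-of-fold and test predictions saved here can later serve as meta-features for stacking; a sketch of reloading them (paths are the ones defined above):

# A sketch of reloading the saved predictions, e.g. for a stacking stage.
p_val_loaded = np.loadtxt(p_val_file, delimiter=',')
p_tst_loaded = np.loadtxt(p_tst_file, delimiter=',')
print(p_val_loaded.shape, p_tst_loaded.shape)  # (54879, 5) (19617, 5)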
Creating the Submission File
sub = pd.read_csv(sample_file, index_col=0)
print(sub.shape)
sub.head()
(19617, 5)
index | 0 | 1 | 2 | 3 | 4
---|---|---|---|---|---
0 | 0 | 0 | 0 | 0 | 0
1 | 0 | 0 | 0 | 0 | 0
2 | 0 | 0 | 0 | 0 | 0
3 | 0 | 0 | 0 | 0 | 0
4 | 0 | 0 | 0 | 0 | 0
sub[sub.columns] = p_tst
sub.head()
index | 0 | 1 | 2 | 3 | 4
---|---|---|---|---|---
0 | 0.0631 | 0.5302 | 0.3155 | 0.0659 | 0.0253
1 | 0.0815 | 0.8202 | 0.0032 | 0.0269 | 0.0682
2 | 0.7208 | 0.0319 | 0.1174 | 0.0381 | 0.0918
3 | 0.0392 | 0.0036 | 0.8465 | 0.0058 | 0.1049
4 | 0.3044 | 0.2440 | 0.1450 | 0.1905 | 0.1161
sub.to_csv(sub_file)
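As a final sanity check (a sketch, not part of the original notebook), each submission row should form a probability distribution over the five classes:

# Each row averages per-fold probability vectors, so it should sum to 1
# up to floating-point error.
assert np.allclose(sub.sum(axis=1), 1, atol=1e-4)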