{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 데모"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 라이브러리 import 및 설정"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:32:22.350713Z",
"start_time": "2020-11-09T04:32:22.049823Z"
}
},
"outputs": [],
"source": [
"%reload_ext autoreload\n",
"%autoreload 2\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:32:23.188282Z",
"start_time": "2020-11-09T04:32:22.352714Z"
}
},
"outputs": [],
"source": [
"from matplotlib import pyplot as plt\n",
"from matplotlib import rcParams\n",
"from nltk.corpus import stopwords\n",
"from nltk.tokenize import word_tokenize\n",
"import numpy as np\n",
"from pathlib import Path\n",
"import pandas as pd\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.metrics import accuracy_score, log_loss\n",
"from sklearn.model_selection import StratifiedKFold\n",
"from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer\n",
"import seaborn as sns\n",
"import warnings"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:32:23.216883Z",
"start_time": "2020-11-09T04:32:23.190555Z"
}
},
"outputs": [],
"source": [
"rcParams['figure.figsize'] = (16, 8)\n",
"plt.style.use('fivethirtyeight')\n",
"# use the fully-qualified option key: the bare 'max_columns' alias is\n",
"# deprecated and removed in pandas >= 1.0 (raises OptionError)\n",
"pd.set_option('display.max_columns', 100)\n",
"pd.set_option(\"display.precision\", 4)\n",
"warnings.simplefilter('ignore')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 학습데이터 로드"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:32:23.250120Z",
"start_time": "2020-11-09T04:32:23.219024Z"
}
},
"outputs": [],
"source": [
"data_dir = Path('../data/dacon-author-classification')\n",
"feature_dir = Path('../build/feature')\n",
"val_dir = Path('../build/val')\n",
"tst_dir = Path('../build/tst')\n",
"sub_dir = Path('../build/sub')\n",
"\n",
"trn_file = data_dir / 'train.csv'\n",
"tst_file = data_dir / 'test_x.csv'\n",
"sample_file = data_dir / 'sample_submission.csv'\n",
"\n",
"target_col = 'author'\n",
"n_fold = 5\n",
"n_class = 5\n",
"seed = 42"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:32:23.282083Z",
"start_time": "2020-11-09T04:32:23.252439Z"
}
},
"outputs": [],
"source": [
"algo_name = 'lr'\n",
"feature_name = 'tfidf'\n",
"model_name = f'{algo_name}_{feature_name}'\n",
"\n",
"feature_file = feature_dir / f'{feature_name}.csv'\n",
"p_val_file = val_dir / f'{model_name}.val.csv'\n",
"p_tst_file = tst_dir / f'{model_name}.tst.csv'\n",
"sub_file = sub_dir / f'{model_name}.csv'"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:32:23.456017Z",
"start_time": "2020-11-09T04:32:23.283900Z"
},
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(54879, 2)\n"
]
},
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" text | \n",
" author | \n",
"
\n",
" \n",
" index | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" He was almost choking. There was so much, so m... | \n",
" 3 | \n",
"
\n",
" \n",
" 1 | \n",
" “Your sister asked for it, I suppose?” | \n",
" 2 | \n",
"
\n",
" \n",
" 2 | \n",
" She was engaged one day as she walked, in per... | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" The captain was in the porch, keeping himself ... | \n",
" 4 | \n",
"
\n",
" \n",
" 4 | \n",
" “Have mercy, gentlemen!” odin flung up his han... | \n",
" 3 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" text author\n",
"index \n",
"0 He was almost choking. There was so much, so m... 3\n",
"1 “Your sister asked for it, I suppose?” 2\n",
"2 She was engaged one day as she walked, in per... 1\n",
"3 The captain was in the porch, keeping himself ... 4\n",
"4 “Have mercy, gentlemen!” odin flung up his han... 3"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"trn = pd.read_csv(trn_file, index_col=0)\n",
"print(trn.shape)\n",
"trn.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:32:23.580127Z",
"start_time": "2020-11-09T04:32:23.458379Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(19617, 1)\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" text | \n",
"
\n",
" \n",
" index | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" “Not at all. I think she is one of the most ch... | \n",
"
\n",
" \n",
" 1 | \n",
" \"No,\" replied he, with sudden consciousness, \"... | \n",
"
\n",
" \n",
" 2 | \n",
" As the lady had stated her intention of scream... | \n",
"
\n",
" \n",
" 3 | \n",
" “And then suddenly in the silence I heard a so... | \n",
"
\n",
" \n",
" 4 | \n",
" His conviction remained unchanged. So far as I... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" text\n",
"index \n",
"0 “Not at all. I think she is one of the most ch...\n",
"1 \"No,\" replied he, with sudden consciousness, \"...\n",
"2 As the lady had stated her intention of scream...\n",
"3 “And then suddenly in the silence I heard a so...\n",
"4 His conviction remained unchanged. So far as I..."
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tst = pd.read_csv(tst_file, index_col=0)\n",
"print(tst.shape)\n",
"tst.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## NLTK 예시"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:32:23.609289Z",
"start_time": "2020-11-09T04:32:23.583484Z"
}
},
"outputs": [],
"source": [
"# word_tokenize is already imported in the top import cell; only the\n",
"# lemmatizer and stemmer are new here\n",
"from nltk.stem import WordNetLemmatizer\n",
"from nltk.stem.snowball import SnowballStemmer"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:32:23.645042Z",
"start_time": "2020-11-09T04:32:23.612849Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"“Have mercy, gentlemen!” odin flung up his hands. “Don’t write that, anyway; have some shame. Here I’ve torn my heart asunder before you, and you seize the opportunity and are fingering the wounds in both halves.... Oh, my God!”\n"
]
}
],
"source": [
"s = trn.text[4]\n",
"print(s)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:32:23.682095Z",
"start_time": "2020-11-09T04:32:23.646719Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['“', 'Have', 'mercy', ',', 'gentlemen', '!', '”', 'odin', 'flung', 'up', 'his', 'hands', '.', '“', 'Don', '’', 't', 'write', 'that', ',', 'anyway', ';', 'have', 'some', 'shame', '.', 'Here', 'I', '’', 've', 'torn', 'my', 'heart', 'asunder', 'before', 'you', ',', 'and', 'you', 'seize', 'the', 'opportunity', 'and', 'are', 'fingering', 'the', 'wounds', 'in', 'both', 'halves', '....', 'Oh', ',', 'my', 'God', '!', '”']\n"
]
}
],
"source": [
"tokens = word_tokenize(s)\n",
"print(tokens)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:32:25.650811Z",
"start_time": "2020-11-09T04:32:23.684004Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"['“',\n",
" 'Have',\n",
" 'mercy',\n",
" ',',\n",
" 'gentleman',\n",
" '!',\n",
" '”',\n",
" 'odin',\n",
" 'flung',\n",
" 'up',\n",
" 'his',\n",
" 'hand',\n",
" '.',\n",
" '“',\n",
" 'Don',\n",
" '’',\n",
" 't',\n",
" 'write',\n",
" 'that',\n",
" ',',\n",
" 'anyway',\n",
" ';',\n",
" 'have',\n",
" 'some',\n",
" 'shame',\n",
" '.',\n",
" 'Here',\n",
" 'I',\n",
" '’',\n",
" 've',\n",
" 'torn',\n",
" 'my',\n",
" 'heart',\n",
" 'asunder',\n",
" 'before',\n",
" 'you',\n",
" ',',\n",
" 'and',\n",
" 'you',\n",
" 'seize',\n",
" 'the',\n",
" 'opportunity',\n",
" 'and',\n",
" 'are',\n",
" 'fingering',\n",
" 'the',\n",
" 'wound',\n",
" 'in',\n",
" 'both',\n",
" 'half',\n",
" '....',\n",
" 'Oh',\n",
" ',',\n",
" 'my',\n",
" 'God',\n",
" '!',\n",
" '”']"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lemmatizer = WordNetLemmatizer()\n",
"[lemmatizer.lemmatize(t) for t in tokens]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:32:25.688544Z",
"start_time": "2020-11-09T04:32:25.652709Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"['“',\n",
" 'have',\n",
" 'merci',\n",
" ',',\n",
" 'gentlemen',\n",
" '!',\n",
" '”',\n",
" 'odin',\n",
" 'flung',\n",
" 'up',\n",
" 'his',\n",
" 'hand',\n",
" '.',\n",
" '“',\n",
" 'don',\n",
" '’',\n",
" 't',\n",
" 'write',\n",
" 'that',\n",
" ',',\n",
" 'anyway',\n",
" ';',\n",
" 'have',\n",
" 'some',\n",
" 'shame',\n",
" '.',\n",
" 'here',\n",
" 'i',\n",
" '’',\n",
" 've',\n",
" 'torn',\n",
" 'my',\n",
" 'heart',\n",
" 'asund',\n",
" 'befor',\n",
" 'you',\n",
" ',',\n",
" 'and',\n",
" 'you',\n",
" 'seiz',\n",
" 'the',\n",
" 'opportun',\n",
" 'and',\n",
" 'are',\n",
" 'finger',\n",
" 'the',\n",
" 'wound',\n",
" 'in',\n",
" 'both',\n",
" 'halv',\n",
" '....',\n",
" 'oh',\n",
" ',',\n",
" 'my',\n",
" 'god',\n",
" '!',\n",
" '”']"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"stemmer = SnowballStemmer(\"english\")\n",
"[stemmer.stem(t) for t in tokens]"
]
},
{
"cell_type": "markdown",
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-04T14:54:47.082620Z",
"start_time": "2020-11-04T14:54:47.055487Z"
}
},
"source": [
"## Bag-of-Words 피처 생성"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:32:49.605135Z",
"start_time": "2020-11-09T04:32:25.690283Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(54879, 2685)\n"
]
}
],
"source": [
"vec = CountVectorizer(tokenizer=word_tokenize, stop_words=stopwords.words('english'), ngram_range=(1, 2), min_df=100)\n",
"X_cnt = vec.fit_transform(trn['text'])\n",
"print(X_cnt.shape)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:32:49.639498Z",
"start_time": "2020-11-09T04:32:49.606858Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"matrix([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0]])"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_cnt[0, :50].todense()"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:45:10.310245Z",
"start_time": "2020-11-09T04:44:31.758835Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(54879, 5897) (19617, 5897)\n"
]
}
],
"source": [
"vec = TfidfVectorizer(tokenizer=word_tokenize, stop_words=stopwords.words('english'), ngram_range=(1, 3), min_df=50)\n",
"X = vec.fit_transform(trn['text'])\n",
"X_tst = vec.transform(tst['text'])\n",
"print(X.shape, X_tst.shape)"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:45:10.342347Z",
"start_time": "2020-11-09T04:45:10.312078Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"matrix([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0.]])"
]
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X[0, :50].todense()"
]
},
{
"cell_type": "markdown",
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-04T15:15:47.430701Z",
"start_time": "2020-11-04T15:15:47.404265Z"
}
},
"source": [
"## 로지스틱회귀 모델 학습"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:45:10.370865Z",
"start_time": "2020-11-09T04:45:10.344734Z"
}
},
"outputs": [],
"source": [
"cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:45:10.399912Z",
"start_time": "2020-11-09T04:45:10.373016Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"(54879,)"
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y = trn.author.values\n",
"y.shape"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:45:30.682036Z",
"start_time": "2020-11-09T04:45:10.401772Z"
}
},
"outputs": [],
"source": [
"# Out-of-fold predictions for validation (p) and fold-averaged\n",
"# predictions for the test set (p_tst).\n",
"p = np.zeros((X.shape[0], n_class))\n",
"p_tst = np.zeros((X_tst.shape[0], n_class))\n",
"for i_cv, (i_trn, i_val) in enumerate(cv.split(X, y), 1):\n",
"    clf = LogisticRegression()\n",
"    clf.fit(X[i_trn], y[i_trn])\n",
"    p[i_val, :] = clf.predict_proba(X[i_val])\n",
"    # average over CV folds -> divide by n_fold (the original divided by\n",
"    # n_class, which only gave correct scaling because both equal 5)\n",
"    p_tst += clf.predict_proba(X_tst) / n_fold"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:45:30.762135Z",
"start_time": "2020-11-09T04:45:30.684234Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy (CV): 76.6140%\n",
"Log Loss (CV): 0.6800\n"
]
}
],
"source": [
"print(f'Accuracy (CV): {accuracy_score(y, np.argmax(p, axis=1)) * 100:8.4f}%')\n",
"print(f'Log Loss (CV): {log_loss(pd.get_dummies(y), p):8.4f}')"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:45:30.985540Z",
"start_time": "2020-11-09T04:45:30.763839Z"
}
},
"outputs": [],
"source": [
"np.savetxt(p_val_file, p, fmt='%.6f', delimiter=',')\n",
"np.savetxt(p_tst_file, p_tst, fmt='%.6f', delimiter=',')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 제출 파일 생성"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:45:31.026905Z",
"start_time": "2020-11-09T04:45:30.988491Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(19617, 5)\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
"
\n",
" \n",
" index | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2 3 4\n",
"index \n",
"0 0 0 0 0 0\n",
"1 0 0 0 0 0\n",
"2 0 0 0 0 0\n",
"3 0 0 0 0 0\n",
"4 0 0 0 0 0"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sub = pd.read_csv(sample_file, index_col=0)\n",
"print(sub.shape)\n",
"sub.head()"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:45:31.073372Z",
"start_time": "2020-11-09T04:45:31.028684Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
"
\n",
" \n",
" index | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0.0631 | \n",
" 0.5302 | \n",
" 0.3155 | \n",
" 0.0659 | \n",
" 0.0253 | \n",
"
\n",
" \n",
" 1 | \n",
" 0.0815 | \n",
" 0.8202 | \n",
" 0.0032 | \n",
" 0.0269 | \n",
" 0.0682 | \n",
"
\n",
" \n",
" 2 | \n",
" 0.7208 | \n",
" 0.0319 | \n",
" 0.1174 | \n",
" 0.0381 | \n",
" 0.0918 | \n",
"
\n",
" \n",
" 3 | \n",
" 0.0392 | \n",
" 0.0036 | \n",
" 0.8465 | \n",
" 0.0058 | \n",
" 0.1049 | \n",
"
\n",
" \n",
" 4 | \n",
" 0.3044 | \n",
" 0.2440 | \n",
" 0.1450 | \n",
" 0.1905 | \n",
" 0.1161 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2 3 4\n",
"index \n",
"0 0.0631 0.5302 0.3155 0.0659 0.0253\n",
"1 0.0815 0.8202 0.0032 0.0269 0.0682\n",
"2 0.7208 0.0319 0.1174 0.0381 0.0918\n",
"3 0.0392 0.0036 0.8465 0.0058 0.1049\n",
"4 0.3044 0.2440 0.1450 0.1905 0.1161"
]
},
"execution_count": 74,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sub[sub.columns] = p_tst\n",
"sub.head()"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:45:31.272596Z",
"start_time": "2020-11-09T04:45:31.074976Z"
}
},
"outputs": [],
"source": [
"sub.to_csv(sub_file)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": true,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {
"height": "calc(100% - 180px)",
"left": "10px",
"top": "150px",
"width": "222px"
},
"toc_section_display": true,
"toc_window_display": true
}
},
"nbformat": 4,
"nbformat_minor": 4
}