{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 데모"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 라이브러리 import 및 설정"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:32:22.350713Z",
"start_time": "2020-11-09T04:32:22.049823Z"
}
},
"outputs": [],
"source": [
"%reload_ext autoreload\n",
"%autoreload 2\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:32:23.188282Z",
"start_time": "2020-11-09T04:32:22.352714Z"
}
},
"outputs": [],
"source": [
"from matplotlib import pyplot as plt\n",
"from matplotlib import rcParams\n",
"from nltk.corpus import stopwords\n",
"from nltk.tokenize import word_tokenize\n",
"import numpy as np\n",
"from pathlib import Path\n",
"import pandas as pd\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.metrics import accuracy_score, log_loss\n",
"from sklearn.model_selection import StratifiedKFold\n",
"from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer\n",
"import seaborn as sns\n",
"import warnings"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:32:23.216883Z",
"start_time": "2020-11-09T04:32:23.190555Z"
}
},
"outputs": [],
"source": [
"rcParams['figure.figsize'] = (16, 8)\n",
"plt.style.use('fivethirtyeight')\n",
"# use the fully-qualified option key: the bare 'max_columns' alias is\n",
"# deprecated and removed in pandas >= 1.0 (raises OptionError)\n",
"pd.set_option('display.max_columns', 100)\n",
"pd.set_option(\"display.precision\", 4)\n",
"warnings.simplefilter('ignore')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 학습데이터 로드"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:32:23.250120Z",
"start_time": "2020-11-09T04:32:23.219024Z"
}
},
"outputs": [],
"source": [
"data_dir = Path('../data/dacon-author-classification')\n",
"feature_dir = Path('../build/feature')\n",
"val_dir = Path('../build/val')\n",
"tst_dir = Path('../build/tst')\n",
"sub_dir = Path('../build/sub')\n",
"\n",
"trn_file = data_dir / 'train.csv'\n",
"tst_file = data_dir / 'test_x.csv'\n",
"sample_file = data_dir / 'sample_submission.csv'\n",
"\n",
"target_col = 'author'\n",
"n_fold = 5\n",
"n_class = 5\n",
"seed = 42"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:32:23.282083Z",
"start_time": "2020-11-09T04:32:23.252439Z"
}
},
"outputs": [],
"source": [
"algo_name = 'lr'\n",
"feature_name = 'tfidf'\n",
"model_name = f'{algo_name}_{feature_name}'\n",
"\n",
"feature_file = feature_dir / f'{feature_name}.csv'\n",
"p_val_file = val_dir / f'{model_name}.val.csv'\n",
"p_tst_file = tst_dir / f'{model_name}.tst.csv'\n",
"sub_file = sub_dir / f'{model_name}.csv'"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:32:23.456017Z",
"start_time": "2020-11-09T04:32:23.283900Z"
},
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(54879, 2)\n"
]
},
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" text | \n",
" author | \n",
"
\n",
" \n",
" index | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" He was almost choking. There was so much, so m... | \n",
" 3 | \n",
"
\n",
" \n",
" 1 | \n",
" “Your sister asked for it, I suppose?” | \n",
" 2 | \n",
"
\n",
" \n",
" 2 | \n",
" She was engaged one day as she walked, in per... | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" The captain was in the porch, keeping himself ... | \n",
" 4 | \n",
"
\n",
" \n",
" 4 | \n",
" “Have mercy, gentlemen!” odin flung up his han... | \n",
" 3 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" text author\n",
"index \n",
"0 He was almost choking. There was so much, so m... 3\n",
"1 “Your sister asked for it, I suppose?” 2\n",
"2 She was engaged one day as she walked, in per... 1\n",
"3 The captain was in the porch, keeping himself ... 4\n",
"4 “Have mercy, gentlemen!” odin flung up his han... 3"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"trn = pd.read_csv(trn_file, index_col=0)\n",
"print(trn.shape)\n",
"trn.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:32:23.580127Z",
"start_time": "2020-11-09T04:32:23.458379Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(19617, 1)\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" text | \n",
"
\n",
" \n",
" index | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" “Not at all. I think she is one of the most ch... | \n",
"
\n",
" \n",
" 1 | \n",
" \"No,\" replied he, with sudden consciousness, \"... | \n",
"
\n",
" \n",
" 2 | \n",
" As the lady had stated her intention of scream... | \n",
"
\n",
" \n",
" 3 | \n",
" “And then suddenly in the silence I heard a so... | \n",
"
\n",
" \n",
" 4 | \n",
" His conviction remained unchanged. So far as I... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" text\n",
"index \n",
"0 “Not at all. I think she is one of the most ch...\n",
"1 \"No,\" replied he, with sudden consciousness, \"...\n",
"2 As the lady had stated her intention of scream...\n",
"3 “And then suddenly in the silence I heard a so...\n",
"4 His conviction remained unchanged. So far as I..."
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tst = pd.read_csv(tst_file, index_col=0)\n",
"print(tst.shape)\n",
"tst.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## NLTK 예시"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:32:23.609289Z",
"start_time": "2020-11-09T04:32:23.583484Z"
}
},
"outputs": [],
"source": [
"# word_tokenize is already imported in the top import cell; only the\n",
"# lemmatizer and stemmer are new here\n",
"from nltk.stem import WordNetLemmatizer\n",
"from nltk.stem.snowball import SnowballStemmer"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:32:23.645042Z",
"start_time": "2020-11-09T04:32:23.612849Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"“Have mercy, gentlemen!” odin flung up his hands. “Don’t write that, anyway; have some shame. Here I’ve torn my heart asunder before you, and you seize the opportunity and are fingering the wounds in both halves.... Oh, my God!”\n"
]
}
],
"source": [
"s = trn.text[4]\n",
"print(s)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:32:23.682095Z",
"start_time": "2020-11-09T04:32:23.646719Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['“', 'Have', 'mercy', ',', 'gentlemen', '!', '”', 'odin', 'flung', 'up', 'his', 'hands', '.', '“', 'Don', '’', 't', 'write', 'that', ',', 'anyway', ';', 'have', 'some', 'shame', '.', 'Here', 'I', '’', 've', 'torn', 'my', 'heart', 'asunder', 'before', 'you', ',', 'and', 'you', 'seize', 'the', 'opportunity', 'and', 'are', 'fingering', 'the', 'wounds', 'in', 'both', 'halves', '....', 'Oh', ',', 'my', 'God', '!', '”']\n"
]
}
],
"source": [
"tokens = word_tokenize(s)\n",
"print(tokens)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:32:25.650811Z",
"start_time": "2020-11-09T04:32:23.684004Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"['“',\n",
" 'Have',\n",
" 'mercy',\n",
" ',',\n",
" 'gentleman',\n",
" '!',\n",
" '”',\n",
" 'odin',\n",
" 'flung',\n",
" 'up',\n",
" 'his',\n",
" 'hand',\n",
" '.',\n",
" '“',\n",
" 'Don',\n",
" '’',\n",
" 't',\n",
" 'write',\n",
" 'that',\n",
" ',',\n",
" 'anyway',\n",
" ';',\n",
" 'have',\n",
" 'some',\n",
" 'shame',\n",
" '.',\n",
" 'Here',\n",
" 'I',\n",
" '’',\n",
" 've',\n",
" 'torn',\n",
" 'my',\n",
" 'heart',\n",
" 'asunder',\n",
" 'before',\n",
" 'you',\n",
" ',',\n",
" 'and',\n",
" 'you',\n",
" 'seize',\n",
" 'the',\n",
" 'opportunity',\n",
" 'and',\n",
" 'are',\n",
" 'fingering',\n",
" 'the',\n",
" 'wound',\n",
" 'in',\n",
" 'both',\n",
" 'half',\n",
" '....',\n",
" 'Oh',\n",
" ',',\n",
" 'my',\n",
" 'God',\n",
" '!',\n",
" '”']"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lemmatizer = WordNetLemmatizer()\n",
"[lemmatizer.lemmatize(t) for t in tokens]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:32:25.688544Z",
"start_time": "2020-11-09T04:32:25.652709Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"['“',\n",
" 'have',\n",
" 'merci',\n",
" ',',\n",
" 'gentlemen',\n",
" '!',\n",
" '”',\n",
" 'odin',\n",
" 'flung',\n",
" 'up',\n",
" 'his',\n",
" 'hand',\n",
" '.',\n",
" '“',\n",
" 'don',\n",
" '’',\n",
" 't',\n",
" 'write',\n",
" 'that',\n",
" ',',\n",
" 'anyway',\n",
" ';',\n",
" 'have',\n",
" 'some',\n",
" 'shame',\n",
" '.',\n",
" 'here',\n",
" 'i',\n",
" '’',\n",
" 've',\n",
" 'torn',\n",
" 'my',\n",
" 'heart',\n",
" 'asund',\n",
" 'befor',\n",
" 'you',\n",
" ',',\n",
" 'and',\n",
" 'you',\n",
" 'seiz',\n",
" 'the',\n",
" 'opportun',\n",
" 'and',\n",
" 'are',\n",
" 'finger',\n",
" 'the',\n",
" 'wound',\n",
" 'in',\n",
" 'both',\n",
" 'halv',\n",
" '....',\n",
" 'oh',\n",
" ',',\n",
" 'my',\n",
" 'god',\n",
" '!',\n",
" '”']"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"stemmer = SnowballStemmer(\"english\")\n",
"[stemmer.stem(t) for t in tokens]"
]
},
{
"cell_type": "markdown",
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-04T14:54:47.082620Z",
"start_time": "2020-11-04T14:54:47.055487Z"
}
},
"source": [
"## Bag-of-Words 피처 생성"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:32:49.605135Z",
"start_time": "2020-11-09T04:32:25.690283Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(54879, 2685)\n"
]
}
],
"source": [
"vec = CountVectorizer(tokenizer=word_tokenize, stop_words=stopwords.words('english'), ngram_range=(1, 2), min_df=100)\n",
"X_cnt = vec.fit_transform(trn['text'])\n",
"print(X_cnt.shape)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:32:49.639498Z",
"start_time": "2020-11-09T04:32:49.606858Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"matrix([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0]])"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_cnt[0, :50].todense()"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:45:10.310245Z",
"start_time": "2020-11-09T04:44:31.758835Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(54879, 5897) (19617, 5897)\n"
]
}
],
"source": [
"vec = TfidfVectorizer(tokenizer=word_tokenize, stop_words=stopwords.words('english'), ngram_range=(1, 3), min_df=50)\n",
"X = vec.fit_transform(trn['text'])\n",
"X_tst = vec.transform(tst['text'])\n",
"print(X.shape, X_tst.shape)"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:45:10.342347Z",
"start_time": "2020-11-09T04:45:10.312078Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"matrix([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0.]])"
]
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X[0, :50].todense()"
]
},
{
"cell_type": "markdown",
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-04T15:15:47.430701Z",
"start_time": "2020-11-04T15:15:47.404265Z"
}
},
"source": [
"## 로지스틱회귀 모델 학습"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:45:10.370865Z",
"start_time": "2020-11-09T04:45:10.344734Z"
}
},
"outputs": [],
"source": [
"cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:45:10.399912Z",
"start_time": "2020-11-09T04:45:10.373016Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"(54879,)"
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y = trn.author.values\n",
"y.shape"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:45:30.682036Z",
"start_time": "2020-11-09T04:45:10.401772Z"
}
},
"outputs": [],
"source": [
"# Out-of-fold predictions for validation (p) and fold-averaged\n",
"# predictions for the test set (p_tst).\n",
"p = np.zeros((X.shape[0], n_class))\n",
"p_tst = np.zeros((X_tst.shape[0], n_class))\n",
"for i_cv, (i_trn, i_val) in enumerate(cv.split(X, y), 1):\n",
"    clf = LogisticRegression()\n",
"    clf.fit(X[i_trn], y[i_trn])\n",
"    p[i_val, :] = clf.predict_proba(X[i_val])\n",
"    # average over CV folds -> divide by n_fold (the original divided by\n",
"    # n_class, which only gave correct scaling because both equal 5)\n",
"    p_tst += clf.predict_proba(X_tst) / n_fold"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:45:30.762135Z",
"start_time": "2020-11-09T04:45:30.684234Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy (CV): 76.6140%\n",
"Log Loss (CV): 0.6800\n"
]
}
],
"source": [
"print(f'Accuracy (CV): {accuracy_score(y, np.argmax(p, axis=1)) * 100:8.4f}%')\n",
"print(f'Log Loss (CV): {log_loss(pd.get_dummies(y), p):8.4f}')"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:45:30.985540Z",
"start_time": "2020-11-09T04:45:30.763839Z"
}
},
"outputs": [],
"source": [
"np.savetxt(p_val_file, p, fmt='%.6f', delimiter=',')\n",
"np.savetxt(p_tst_file, p_tst, fmt='%.6f', delimiter=',')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 제출 파일 생성"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:45:31.026905Z",
"start_time": "2020-11-09T04:45:30.988491Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(19617, 5)\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
"
\n",
" \n",
" index | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2 3 4\n",
"index \n",
"0 0 0 0 0 0\n",
"1 0 0 0 0 0\n",
"2 0 0 0 0 0\n",
"3 0 0 0 0 0\n",
"4 0 0 0 0 0"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sub = pd.read_csv(sample_file, index_col=0)\n",
"print(sub.shape)\n",
"sub.head()"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:45:31.073372Z",
"start_time": "2020-11-09T04:45:31.028684Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
"
\n",
" \n",
" index | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0.0631 | \n",
" 0.5302 | \n",
" 0.3155 | \n",
" 0.0659 | \n",
" 0.0253 | \n",
"
\n",
" \n",
" 1 | \n",
" 0.0815 | \n",
" 0.8202 | \n",
" 0.0032 | \n",
" 0.0269 | \n",
" 0.0682 | \n",
"
\n",
" \n",
" 2 | \n",
" 0.7208 | \n",
" 0.0319 | \n",
" 0.1174 | \n",
" 0.0381 | \n",
" 0.0918 | \n",
"
\n",
" \n",
" 3 | \n",
" 0.0392 | \n",
" 0.0036 | \n",
" 0.8465 | \n",
" 0.0058 | \n",
" 0.1049 | \n",
"
\n",
" \n",
" 4 | \n",
" 0.3044 | \n",
" 0.2440 | \n",
" 0.1450 | \n",
" 0.1905 | \n",
" 0.1161 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2 3 4\n",
"index \n",
"0 0.0631 0.5302 0.3155 0.0659 0.0253\n",
"1 0.0815 0.8202 0.0032 0.0269 0.0682\n",
"2 0.7208 0.0319 0.1174 0.0381 0.0918\n",
"3 0.0392 0.0036 0.8465 0.0058 0.1049\n",
"4 0.3044 0.2440 0.1450 0.1905 0.1161"
]
},
"execution_count": 74,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sub[sub.columns] = p_tst\n",
"sub.head()"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-09T04:45:31.272596Z",
"start_time": "2020-11-09T04:45:31.074976Z"
}
},
"outputs": [],
"source": [
"sub.to_csv(sub_file)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": true,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {
"height": "calc(100% - 180px)",
"left": "10px",
"top": "150px",
"width": "222px"
},
"toc_section_display": true,
"toc_window_display": true
}
},
"nbformat": 4,
"nbformat_minor": 4
}