{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"accelerator": "GPU",
"colab": {
   "name": "Keras_MLP_BBC_dataset.ipynb",
   "provenance": [],
   "collapsed_sections": []
},
"kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
},
"language_info": {
   "codemirror_mode": {
   "name": "ipython",
   "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
}
},
"cells": [
{
   "cell_type": "markdown",
   "metadata": {
   "id": "5AmUvXZILnKx"
   },
   "source": [
   "### Import libraries"
   ]
},
{
   "cell_type": "code",
   "metadata": {
   "id": "n2utcVcCVE7-"
   },
   "source": [
   "import pandas as pd\n",
   "import numpy as np\n",
   "import sklearn\n",
   "import seaborn as sn\n",
   "import matplotlib.pyplot as plt\n",
   "from sklearn.metrics import accuracy_score\n",
   "from sklearn.metrics import confusion_matrix\n",
   "from sklearn.model_selection import train_test_split\n",
   "from sklearn.feature_extraction.text import TfidfVectorizer\n",
   "\n",
   "from sklearn.naive_bayes import MultinomialNB\n",
   "from sklearn.ensemble import RandomForestClassifier\n",
   "from sklearn import svm\n",
   "from sklearn.linear_model import LogisticRegression\n",
   "from sklearn.model_selection import GridSearchCV\n",
   "\n",
   "from sklearn.manifold import TSNE\n",
   "from sklearn.metrics import classification_report\n",
   "from numpy.random import seed\n",
   "seed(1)\n",
   "import warnings\n",
   "warnings.filterwarnings('ignore')"
   ],
   "execution_count": 188,
   "outputs": []
},
{
   "cell_type": "markdown",
   "metadata": {
   "id": "5Kv1CwnJMfUD"
   },
   "source": [
   "### create_dataframe() function implemented below converts all the txt files present in the folders to the pandas dataframe by first converting them to the raw text using the load_data function.\n",
   "\n",
   "<br>\n",
   "\n",
   "### This way we can perform the classification with all the datasets present in a single pandas dataframe"
   ]
},
{
   "cell_type": "code",
   "metadata": {
   "id": "YNxx7Tt6VLIg"
   },
   "source": [
   "def create_dataframe(): \n",
   " data = pd.read_csv('BBC News Train.csv')\n",
   " data['category_id'] = data['Category'].factorize()[0]\n",
   " category_id_df = data[['Category', 'category_id']].drop_duplicates().sort_values('category_id')\n",
   " return data, category_id_df"
   ],
   "execution_count": 189,
   "outputs": []
},
{
   "cell_type": "code",
   "metadata": {
   "id": "EgHpdSdzCCkJ"
   },
   "source": [
   "def preprocessing(df):\n",
   " tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')\n",
   " features = tfidf.fit_transform(df.Text).toarray() \n",
   " labels = df.category_id\n",
   " return tfidf, features, labels"
   ],
   "execution_count": 190,
   "outputs": []
},
{
   "cell_type": "markdown",
   "metadata": {
   "id": "sRfUPuQRRz94"
   },
   "source": [
   "find_retrieve function simply finds the retrieve value for each class using the confusion matrix"
   ]
},
{
   "cell_type": "code",
   "metadata": {
   "id": "GQVzLsUfqlQc"
   },
   "source": [
   "def find_retrieve(cm,id_to_type):\n",
   " retrieve = {}\n",
   " for i in range(cm.shape[0]):\n",
   "    cls = id_to_type[i]\n",
   "    value = cm[i,i]/np.sum(cm[i,:])\n",
   "    retrieve[cls] = value\n",
   " return retrieve"
   ],
   "execution_count": 191,
   "outputs": []
},
{
   "cell_type": "markdown",
   "metadata": {
   "id": "xgcn0VNgOnJz"
   },
   "source": [
   "## Creating Dataframe in Pandas format"
   ]
},
{
   "cell_type": "code",
   "metadata": {
   "id": "sLGp8eZaWAjr",
   "colab": {
      "base_uri": "https://localhost:8080/"
   },
   "outputId": "b03e6028-950f-4941-ad96-f3416727310b"
   },
   "source": [
   "df, category_id_df = create_dataframe()\n",
   "category_to_id = dict(category_id_df.values)\n",
   "id_to_category = dict(category_id_df[['category_id', 'Category']].values)\n",
   "id_to_category"
   ],
   "execution_count": 192,
   "outputs": [
   {
      "output_type": "execute_result",
      "data": {
         "text/plain": [
         "{0: 'business', 1: 'tech', 2: 'politics', 3: 'sport', 4: 'entertainment'}"
         ]
      },
      "metadata": {
         "tags": []
      },
      "execution_count": 192
   }
   ]
},

1.[Analytical question] Consider two Normally distributed random va...