{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "0163b78b-7a23-49cb-845e-da1315373e77",
   "metadata": {
    "id": "9nkDv5dppU6B"
   },
   "source": [
    "# HEX algorithm **Kopuru Vespa Velutina Competition**\n",
    "\n",
    "**XGBoost model**\n",
    "\n",
    "Purpose: Predict the number of Nests in each of Biscay's 112 municipalities for the year 2020.\n",
    "\n",
    "Output: *(WaspBusters_20210609_batch_XGBy_48019prodigal.csv)*\n",
    "\n",
    "@authors:\n",
    "* mario.bejar@student.ie.edu\n",
    "* pedro.geirinhas@student.ie.edu\n",
    "* a.berrizbeitia@student.ie.edu\n",
    "* pcasaverde@student.ie.edu"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d6c7526e-815c-403e-b586-a33c34e3a9cd",
   "metadata": {},
   "source": [
    "## Libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "1f8be8fd-d42c-4992-a290-acfa2733c1e4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Base packages -----------------------------------\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "# Visualization -----------------------------------\n",
    "import matplotlib.pyplot as plt\n",
    "plt.rcParams[\"figure.figsize\"] = (15, 10)\n",
    "import seaborn as sns\n",
    "plt.style.use(\"seaborn-notebook\")\n",
    "\n",
    "# Scaling data ------------------------------------\n",
    "from sklearn import preprocessing\n",
    "\n",
    "# Grid search -------------------------------------\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "\n",
    "# Confusion matrix --------------------------------\n",
    "#from sklearn.metrics import confusion_matrix\n",
    "from sklearn.metrics import classification_report\n",
    "\n",
    "# XGBoost -----------------------------------------\n",
    "from xgboost import XGBRegressor\n",
    "from xgboost import plot_importance"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e5c4c881-bd4d-4bb0-9f53-c07586957608",
   "metadata": {},
   "source": [
    "## Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "09fb9ce5-1163-440f-945c-5de00fd9f079",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Function that checks if final Output is ready for submission or needs revision   \n",
    "\n",
    "def check_data(HEX):\n",
    "    \n",
    "    def template_checker(HEX):\n",
    "        submission_df = (HEX[\"CODIGO MUNICIPIO\"].astype(\"string\") + HEX[\"NOMBRE MUNICIPIO\"]).sort_values().reset_index(drop=True)\n",
    "        template_df = (template[\"CODIGO MUNICIPIO\"].astype(\"string\") + template[\"NOMBRE MUNICIPIO\"]).sort_values().reset_index(drop=True)\n",
    "        check_df = pd.DataFrame({\"submission_df\":submission_df,\"template_df\":template_df})\n",
    "        check_df[\"check\"] = check_df.submission_df == check_df.template_df\n",
    "        if (check_df.check == False).any():\n",
    "            pd.options.display.max_rows = 112\n",
    "            return check_df.loc[check_df.check == False,:]\n",
    "        else:  \n",
    "            return \"All Municipality Names and Codes to be submitted match the Template\"\n",
    "    \n",
    "    print(\"Submission form Shape is\", HEX.shape)\n",
    "    print(\"Number of Municipalities is\", HEX[\"CODIGO MUNICIPIO\"].nunique())\n",
    "    print(\"The Total 2020 Nests' Prediction is\", int(HEX[\"NIDOS 2020\"].sum()))\n",
    "\n",
    "    assert HEX.shape == (112, 3), \"Error: Shape is incorrect.\"\n",
    "    assert HEX[\"CODIGO MUNICIPIO\"].nunique() == 112, \"Error: Number of unique municipalities is correct.\"    \n",
    "    return template_checker(HEX)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ccb2292a-2866-451b-95b6-6150b4b76f09",
   "metadata": {},
   "source": [
    "## Get the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "e1bc52db-6695-493c-b5d8-87618df04690",
   "metadata": {},
   "outputs": [],
   "source": [
    "QUEEN_train = pd.read_csv('../Feeder_months/WBds03_QUEENtrainMONTHS.csv', sep=',')\n",
    "QUEEN_predict = pd.read_csv('../Feeder_months/WBds03_QUEENpredictMONTHS.csv', sep=',')\n",
    "\n",
    "clustersMario = pd.read_csv(\"../auxiliary_files/WBds_CLUSTERSnests.csv\")\n",
    "\n",
    "template = pd.read_csv(\"../../../Input_open_data/ds01_PLANTILLA-RETO-AVISPAS-KOPURU.csv\",sep=\";\", encoding=\"utf-8\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "8f42f8f7-3c3d-402f-b7a3-5d073b3c9315",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "#QUEEN_predict.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "id": "09be088b-b37a-4ffc-b492-2eb78560e2eb",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2688, 43)"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "QUEEN_train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "id": "d9660c01-5ab0-4e8c-af3e-a616bffcd854",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1344, 43)"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "QUEEN_predict.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "294dd4f3-7342-4ecb-97cc-4e61282097d5",
   "metadata": {},
   "source": [
    "### Add in more Clusters (nest amount clusters)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "afac6fe7-d43b-4823-bf50-0adba2181923",
   "metadata": {},
   "outputs": [],
   "source": [
    "QUEEN_train = pd.merge(QUEEN_train, clustersMario, how = 'left', on = ['municip_code', 'municip_name'])\n",
    "QUEEN_predict = pd.merge(QUEEN_predict, clustersMario, how = 'left', on = ['municip_code', 'municip_name'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "id": "bd93f890-27ff-43d8-8574-6d4f156c09ac",
   "metadata": {},
   "outputs": [],
   "source": [
    "QUEEN_train.fillna(4, inplace=True)\n",
    "QUEEN_predict.fillna(4, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "id": "81b4a261-a2b3-4f10-8672-0c9bf63e8f19",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2688, 44)"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "QUEEN_train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "445c00d1-f046-40d9-8b30-bea28651d58e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1344, 44)"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "QUEEN_predict.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "id": "07dad17a-8932-428c-ac33-91b31e0cdec0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.0    792\n",
       "2.0    492\n",
       "1.0     48\n",
       "4.0     12\n",
       "Name: Cluster, dtype: int64"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "QUEEN_predict.Cluster.value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2b22ede1-691b-445a-8da7-99183147c476",
   "metadata": {},
   "source": [
    "## Get hyperparameters with GridsearchCV using 2018's features (i.e. 2019's nests) as the test year"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "996f2380-d71b-4d2b-bf44-4c7c1dd459d9",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# The target variable\n",
    "hyper_y_train = QUEEN_train.loc[QUEEN_train.year_offset.isin([2017]), ['municip_code', 'year_offset', 'month', 'NESTS']]\n",
    "hyper_y_train = hyper_y_train.sort_values(by=['year_offset', 'month', 'municip_code'], ascending=True)\n",
    "hyper_y_train.set_index(['year_offset', 'month', 'municip_code'], inplace=True)\n",
    "\n",
    "hyper_y_test = QUEEN_train.loc[QUEEN_train.year_offset.isin([2018]), ['municip_code', 'year_offset', 'month', 'NESTS']]\n",
    "hyper_y_test = hyper_y_test.sort_values(by=['year_offset', 'month', 'municip_code'], ascending=True)\n",
    "hyper_y_test.set_index(['year_offset', 'month', 'municip_code'], inplace=True)\n",
    "\n",
    "\n",
    "# The features matrix\n",
    "hyperXtrain = QUEEN_train.loc[QUEEN_train.year_offset.isin([2017]), :].drop(['municip_name', 'station_code', 'station_name', 'NESTS'], axis=1)\n",
    "hyperXtrain = hyperXtrain.sort_values(by=['year_offset', 'month', 'municip_code'], ascending=True)\n",
    "hyperXtrain.set_index(['year_offset', 'month', 'municip_code'], inplace=True)\n",
    "\n",
    "hyperXtest = QUEEN_train.loc[QUEEN_train.year_offset.isin([2018]), :].drop(['municip_name', 'station_code', 'station_name', 'NESTS'], axis=1)\n",
    "hyperXtest = hyperXtest.sort_values(by=['year_offset', 'month', 'municip_code'], ascending=True)\n",
    "hyperXtest.set_index(['year_offset', 'month', 'municip_code'], inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "5b468f67-247d-481e-8c8d-1ff9be0965cf",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fitting 3 folds for each of 9 candidates, totalling 27 fits\n",
      "[04:07:39] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/objective/regression_obj.cu:171: reg:linear is now deprecated in favor of reg:squarederror.\n",
      "[04:07:39] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:573: \n",
      "Parameters: { \"silent\" } might not be used.\n",
      "\n",
      "  This may not be accurate due to some parameters are only used in language bindings but\n",
      "  passed down to XGBoost core.  Or some parameters are not used but slip through this\n",
      "  verification. Please open an issue if you find above cases.\n",
      "\n",
      "\n",
      "-0.8732560531236078\n"
     ]
    }
   ],
   "source": [
    "xgb1 = XGBRegressor(random_state=23)\n",
    "parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower\n",
    "              'objective':['reg:linear'],\n",
    "              'learning_rate': [.03, 0.05, .07], #so called `eta` value\n",
    "              'max_depth': [5, 6, 7],\n",
    "             'min_child_weight': [4],\n",
    "              'silent': [1],\n",
    "              'subsample': [0.7],\n",
    "              'colsample_bytree': [0.7],\n",
    "              'n_estimators': [500]}\n",
    "\n",
    "xgb_grid = GridSearchCV(xgb1,\n",
    "                        parameters,\n",
    "                        cv = 3,\n",
    "                        n_jobs = 5,\n",
    "                        verbose=True)\n",
    "\n",
    "xgb_grid.fit(hyperXtrain, hyper_y_train)\n",
    "\n",
    "print(xgb_grid.best_score_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "id": "215120b2-58f6-4974-a59a-7005fb80e8eb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'colsample_bytree': 0.7, 'learning_rate': 0.03, 'max_depth': 5, 'min_child_weight': 4, 'n_estimators': 500, 'nthread': 4, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.7}\n"
     ]
    }
   ],
   "source": [
    "print(xgb_grid.best_params_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "id": "56e4a20d-85d7-4304-8760-e45b9a13c52f",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_xgb_grid = xgb_grid.best_estimator_.predict(hyperXtest)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "id": "c3c47027-ec43-479a-9818-c4295550a5ca",
   "metadata": {},
   "outputs": [],
   "source": [
    "#matrix = confusion_matrix(hyper_y_test, y_xgb_grid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "id": "18336fdc-6db0-4217-a7be-99260c147012",
   "metadata": {},
   "outputs": [],
   "source": [
    "#ax = sns.heatmap(\n",
    "#    matrix.T, square=True, annot=True, fmt=\"d\", cbar=False, cmap=\"viridis\",\n",
    "#    xticklabels=[\"0\", \"1\"], yticklabels=[\"0\", \"1\"]\n",
    "#)\n",
    "#ax.set_xlabel(\"True label\")\n",
    "#ax.set_ylabel(\"Predicted label\");"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "id": "71d8bfad-e0dd-4805-b1fc-ad9c8d1f0413",
   "metadata": {},
   "outputs": [],
   "source": [
    "#print(classification_report(hyper_y_test, y_xgb_grid))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "02e47661-dd8c-4228-b35f-ec14059acf11",
   "metadata": {},
   "source": [
    "## Prediction time!"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "21e88d86-2520-4b11-8d79-bb6e7d1ceb1f",
   "metadata": {},
   "source": [
    "### 1. Choose the model class"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "id": "b0dcb655-54cb-412b-958e-45d96f1f4f50",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "xgboost.sklearn.XGBRegressor"
      ]
     },
     "execution_count": 69,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "XGBRegressor"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "376df578-0e36-4453-9cb9-ad0b73a9123c",
   "metadata": {},
   "source": [
    "### 2. Instantiate the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "id": "9c64b189-11cf-49c1-96cd-aa127509e6d4",
   "metadata": {},
   "outputs": [],
   "source": [
    "xgb = xgb_grid.best_estimator_"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9c74df2a-c0d9-4afc-8cf1-56405fc6a75c",
   "metadata": {},
   "source": [
    "### 3. Prepare Feature matrix and Target variable"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "id": "b6276909-8714-4b2d-aeb6-1579563a3cfa",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# The target variable\n",
    "y_train = QUEEN_train.loc[:, ['municip_code', 'year_offset', 'month', 'NESTS']]\n",
    "#y_train = y_train.sort_values(by=['year_offset', 'month', 'municip_code'], ascending=True)\n",
    "y_train.set_index(['year_offset', 'month', 'municip_code'], inplace=True)\n",
    "\n",
    "y_predict = QUEEN_predict.loc[:, ['municip_code', 'year_offset', 'month', 'NESTS']]\n",
    "#y_predict = y_predict.sort_values(by=['year_offset', 'month', 'municip_code'], ascending=True)\n",
    "y_predict.set_index(['year_offset', 'month', 'municip_code'], inplace=True)\n",
    "\n",
    "# The features matrix\n",
    "X_train = QUEEN_train.drop(['municip_name', 'station_code', 'station_name', 'NESTS'], axis=1)\n",
    "#X_train = X_train.sort_values(by=['year_offset', 'month', 'municip_code'], ascending=True)\n",
    "X_train.set_index(['year_offset', 'month', 'municip_code'], inplace=True)\n",
    "\n",
    "X_predict = QUEEN_predict.drop(['municip_name', 'station_code', 'station_name', 'NESTS'], axis=1)\n",
    "#X_predict = X_predict.sort_values(by=['year_offset', 'month', 'municip_code'], ascending=True)\n",
    "X_predict.set_index(['year_offset', 'month', 'municip_code'], inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "id": "f84134d0-f089-45c0-855e-ddaebb0414e3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2688, 37)"
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "id": "14b502af-e66e-4e9b-af81-713cef08800e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2688, 1)"
      ]
     },
     "execution_count": 73,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "id": "08d3fe91-ecc3-4a6b-aa66-bf4763d597db",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1344, 37)"
      ]
     },
     "execution_count": 74,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_predict.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "id": "38e83bd4-3302-41e2-b78f-ea017c632fc6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1344, 1)"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_predict.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cf56ca71-4efa-45e2-9b46-dfd27a9678e6",
   "metadata": {},
   "source": [
    "### 4. Fit the model to the training data sets"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c1baa614-e0d6-4a29-ba6a-ce7243deb938",
   "metadata": {},
   "source": [
    "#### Scale and get feature importance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "id": "a5e29e1e-d356-4118-b1bd-9c1bd4fdcb6f",
   "metadata": {},
   "outputs": [],
   "source": [
    "#X = X_train\n",
    "#y = y_train\n",
    "#scalators = X.columns\n",
    "#X[scalators] = preprocessing.minmax_scale(X[scalators])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "id": "afd7d13b-9349-4764-abf9-6792e748f80f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# define the model\n",
    "#model_fi = XGBRegressor(random_state=23)\n",
    "\n",
    "# fit the model\n",
    "#model_fi.fit(X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "id": "2173c355-cab3-4ba7-a1eb-9c179fdf23e3",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# get importance\n",
    "#importance = model_fi.feature_importances_\n",
    "# summarize feature importance\n",
    "#for i,v in enumerate(importance):\n",
    "#\tprint('Feature: %0d, Score: %.5f' % (i,v))\n",
    "# plot feature importance\n",
    "#plot_importance(model_fi, height=0.5, xlabel=\"F-Score\", ylabel=\"Feature Importance\", grid=False)\n",
    "#plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "43eec2b5-0ba6-4b93-aff2-3e64cca0c2a7",
   "metadata": {},
   "source": [
    "#### Now, do fit the model but only with the relevant features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "id": "fe675993-0bdd-420e-b4e5-c03b09a8c4f2",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train = X_train.loc[:, ['population', 'weath_humidity', 'food_fruit', 'weath_maxLevel', 'food_txakoli', 'weath_midLevel', 'weath_minLevel', 'colonies_amount', 'weath_maxWindM', 'weath_meanWindM', 'weath_accuRainfall', 'weath_10minRainfall', 'food_kiwi', 'food_apple', 'weath_days_rain1mm', 'weath_meanDayMaxWind', 'weath_meanTemp']]\n",
    "\n",
    "X_predict = X_predict.loc[:, ['population', 'weath_humidity', 'food_fruit', 'weath_maxLevel', 'food_txakoli', 'weath_midLevel', 'weath_minLevel', 'colonies_amount', 'weath_maxWindM', 'weath_meanWindM', 'weath_accuRainfall', 'weath_10minRainfall', 'food_kiwi', 'food_apple', 'weath_days_rain1mm', 'weath_meanDayMaxWind', 'weath_meanTemp']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "id": "3dd1d415-6a4e-4756-bb7f-cce3832a42f1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[04:07:40] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/objective/regression_obj.cu:171: reg:linear is now deprecated in favor of reg:squarederror.\n",
      "[04:07:40] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:573: \n",
      "Parameters: { \"silent\" } might not be used.\n",
      "\n",
      "  This may not be accurate due to some parameters are only used in language bindings but\n",
      "  passed down to XGBoost core.  Or some parameters are not used but slip through this\n",
      "  verification. Please open an issue if you find above cases.\n",
      "\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n",
       "             colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1,\n",
       "             importance_type='gain', interaction_constraints='',\n",
       "             learning_rate=0.03, max_delta_step=0, max_depth=5,\n",
       "             min_child_weight=4, missing=nan, monotone_constraints='()',\n",
       "             n_estimators=500, n_jobs=4, nthread=4, num_parallel_tree=1,\n",
       "             objective='reg:linear', random_state=23, reg_alpha=0, reg_lambda=1,\n",
       "             scale_pos_weight=1, silent=1, subsample=0.7, tree_method='exact',\n",
       "             validate_parameters=1, verbosity=None)"
      ]
     },
     "execution_count": 80,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "xgb.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8512cec4-74d9-4e4c-8175-9e7797e16192",
   "metadata": {},
   "source": [
    "### 5. Predict the labels for new data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "id": "dbb7a4d0-3df4-4c89-b30d-f8fdb9e4566b",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_predict = xgb.predict(X_predict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "id": "c15d9b7e-f072-4df7-b65e-c624997bbcd5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy on the training set: 88%\n"
     ]
    }
   ],
   "source": [
    "accuracy_train = xgb.score(X_train, y_train)\n",
    "print(f\"Accuracy on the training set: {accuracy_train:.0%}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "id": "21ebb5c9-7a67-4710-9167-7ba8b4732c8a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy on the test set: 100%\n"
     ]
    }
   ],
   "source": [
    "accuracy_predict = xgb.score(X_predict, y_predict)\n",
    "print(f\"Accuracy on the test set: {accuracy_predict:.0%}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "id": "1d73fcda-4959-4988-a8cd-ba100f8603d2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1344,)"
      ]
     },
     "execution_count": 84,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_predict.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "id": "a2b34068-a080-4093-8ac4-6ae64c512390",
   "metadata": {},
   "outputs": [],
   "source": [
    "QUEEN_predict['NESTS'] = y_predict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "id": "5e13172a-ae79-406c-a1ab-129ed6434389",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2904.2285"
      ]
     },
     "execution_count": 86,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "QUEEN_predict.NESTS.sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "id": "54e7287b-3467-4186-8d7f-9b45bf793c59",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-87-ef4b3a26f94e>:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  QUEEN_predict.NESTS[QUEEN_predict.NESTS < 0] = 0\n"
     ]
    }
   ],
   "source": [
    "QUEEN_predict.NESTS[QUEEN_predict.NESTS < 0] = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "id": "7cfbaa57-8cf2-4306-8cbd-f0d2b15f442e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2909.1338"
      ]
     },
     "execution_count": 88,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "QUEEN_predict.NESTS.sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3423a8da-0a1c-48f2-813e-44d2988cb52a",
   "metadata": {},
   "source": [
    "## Prepare the dataset for submission"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "id": "b89c9e5d-6c6c-450f-bae6-deee72f0d6a6",
   "metadata": {},
   "outputs": [],
   "source": [
    "HEX = QUEEN_predict.loc[:,['municip_code', 'municip_name', 'NESTS']].groupby(by=['municip_code', 'municip_name'], as_index=False).sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f0a437ca-e975-42ab-b15a-1ee8ced3bdd0",
   "metadata": {},
   "source": [
    "## Adjust manually for Bilbao 48020 and generate the output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "id": "993847dc-43b2-412f-9746-02b2be1c805a",
   "metadata": {},
   "outputs": [],
   "source": [
    "HEX.loc[HEX.municip_code.isin([48020]), 'NESTS'] = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "id": "8fc877e4-5616-4c3a-9af5-87a7de17b1dc",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>municip_code</th>\n",
       "      <th>municip_name</th>\n",
       "      <th>NESTS</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>48020</td>\n",
       "      <td>Bilbao</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>48022</td>\n",
       "      <td>Karrantza Harana/Valle de Carranza</td>\n",
       "      <td>20.604404</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50</th>\n",
       "      <td>48051</td>\n",
       "      <td>Lanestosa</td>\n",
       "      <td>5.878829</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70</th>\n",
       "      <td>48071</td>\n",
       "      <td>Muskiz</td>\n",
       "      <td>22.939783</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>73</th>\n",
       "      <td>48074</td>\n",
       "      <td>Urduña/Orduña</td>\n",
       "      <td>22.237362</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>87</th>\n",
       "      <td>48088</td>\n",
       "      <td>Ubide</td>\n",
       "      <td>4.863704</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    municip_code                        municip_name      NESTS\n",
       "19         48020                              Bilbao   0.000000\n",
       "21         48022  Karrantza Harana/Valle de Carranza  20.604404\n",
       "50         48051                           Lanestosa   5.878829\n",
       "70         48071                              Muskiz  22.939783\n",
       "73         48074                       Urduña/Orduña  22.237362\n",
       "87         48088                               Ubide   4.863704"
      ]
     },
     "execution_count": 91,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "HEX.loc[HEX.municip_code.isin([48022, 48071, 48088, 48074, 48051, 48020]), :]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "id": "24305092-c891-4e1e-91cb-0a23090e3f67",
   "metadata": {},
   "outputs": [],
   "source": [
    "HEX.columns = [\"CODIGO MUNICIPIO\", \"NOMBRE MUNICIPIO\", \"NIDOS 2020\"] # change column names to Spanish (Competition template)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "id": "bc3b17dd-e71a-427e-9525-051c70128969",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Submission form Shape is (112, 3)\n",
      "Number of Municipalities is 112\n",
      "The Total 2020 Nests' Prediction is 2900\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'All Municipality Names and Codes to be submitted match the Template'"
      ]
     },
     "execution_count": 93,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "check_data(HEX)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "25535ba3-2b88-4461-9503-84f3606f13ca",
   "metadata": {},
   "source": [
    "### Export dataset for submission"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "id": "f209395d-f4f9-47ba-8bf1-a3b73fb8ed90",
   "metadata": {},
   "outputs": [],
   "source": [
    "HEX.to_csv('WaspBusters_20210609_136-mXGB-prodigal-GSCV-noSort-FI-no0s.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ab29f668-6aeb-40e3-8a3d-1a999f9aae2e",
   "metadata": {},
   "source": [
    "## VERSION Manual adjustments"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "id": "fff13fc8-2c9b-4b21-89ee-8b114283c276",
   "metadata": {},
   "outputs": [],
   "source": [
    "HEX.columns = ['municip_code', 'municip_name', 'NESTS'] # change column names to Spanish (Competition template)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "id": "b86a00c3-d839-4a5d-931f-9e65f77ed6a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "HEX.loc[HEX.municip_code.isin([48022, 48071, 48088, 48074, 48051]), 'NESTS'] = [0,0,1,0,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "id": "32a429bc-4dcf-4cec-9b57-9c0b297888c6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>municip_code</th>\n",
       "      <th>municip_name</th>\n",
       "      <th>NESTS</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>48020</td>\n",
       "      <td>Bilbao</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>48022</td>\n",
       "      <td>Karrantza Harana/Valle de Carranza</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50</th>\n",
       "      <td>48051</td>\n",
       "      <td>Lanestosa</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70</th>\n",
       "      <td>48071</td>\n",
       "      <td>Muskiz</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>73</th>\n",
       "      <td>48074</td>\n",
       "      <td>Urduña/Orduña</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>87</th>\n",
       "      <td>48088</td>\n",
       "      <td>Ubide</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    municip_code                        municip_name  NESTS\n",
       "19         48020                              Bilbao    0.0\n",
       "21         48022  Karrantza Harana/Valle de Carranza    0.0\n",
       "50         48051                           Lanestosa    0.0\n",
       "70         48071                              Muskiz    1.0\n",
       "73         48074                       Urduña/Orduña    0.0\n",
       "87         48088                               Ubide    1.0"
      ]
     },
     "execution_count": 97,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "HEX.loc[HEX.municip_code.isin([48022, 48071, 48088, 48074, 48051, 48020]), :]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "id": "71e6b3a6-fc2b-4555-be00-ef7cef9c5789",
   "metadata": {},
   "outputs": [],
   "source": [
    "HEX.columns = [\"CODIGO MUNICIPIO\", \"NOMBRE MUNICIPIO\", \"NIDOS 2020\"] # change column names to Spanish (Competition template)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "id": "59b95188-bbb5-45c2-9ed3-14bf93807813",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Submission form Shape is (112, 3)\n",
      "Number of Municipalities is 112\n",
      "The Total 2020 Nests' Prediction is 2826\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'All Municipality Names and Codes to be submitted match the Template'"
      ]
     },
     "execution_count": 99,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "check_data(HEX)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cc759394-8282-496d-9901-090c58e165e2",
   "metadata": {},
   "source": [
    "### Export dataset for submission"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "id": "22a4eacd-46a5-45d6-a4de-bc3fe235b01a",
   "metadata": {},
   "outputs": [],
   "source": [
    "HEX.to_csv('WaspBusters_20210609_135-mXGB-prodigal-GSCV-noSort-FI-0s.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}