{ "cells": [ { "cell_type": "markdown", "id": "0163b78b-7a23-49cb-845e-da1315373e77", "metadata": { "id": "9nkDv5dppU6B" }, "source": [ "# HEX algorithm **Kopuru Vespa Velutina Competition**\n", "\n", "**XGBoost model**\n", "\n", "Purpose: Predict the number of Nests in each of Biscay's 112 municipalities for the year 2020.\n", "\n", "Output: *(WaspBusters_20210609_batch_XGBy_48019prodigal.csv)*\n", "\n", "@authors:\n", "* mario.bejar@student.ie.edu\n", "* pedro.geirinhas@student.ie.edu\n", "* a.berrizbeitia@student.ie.edu\n", "* pcasaverde@student.ie.edu" ] }, { "cell_type": "markdown", "id": "d6c7526e-815c-403e-b586-a33c34e3a9cd", "metadata": {}, "source": [ "## Libraries" ] }, { "cell_type": "code", "execution_count": 51, "id": "1f8be8fd-d42c-4992-a290-acfa2733c1e4", "metadata": {}, "outputs": [], "source": [ "# Base packages -----------------------------------\n", "import numpy as np\n", "import pandas as pd\n", "\n", "# Visualization -----------------------------------\n", "import matplotlib.pyplot as plt\n", "plt.rcParams[\"figure.figsize\"] = (15, 10)\n", "import seaborn as sns\n", "plt.style.use(\"seaborn-notebook\")\n", "\n", "# Scaling data ------------------------------------\n", "from sklearn import preprocessing\n", "\n", "# Grid search -------------------------------------\n", "from sklearn.model_selection import GridSearchCV\n", "\n", "# Confusion matrix --------------------------------\n", "#from sklearn.metrics import confusion_matrix\n", "from sklearn.metrics import classification_report\n", "\n", "# XGBoost -----------------------------------------\n", "from xgboost import XGBRegressor\n", "from xgboost import plot_importance" ] }, { "cell_type": "markdown", "id": "e5c4c881-bd4d-4bb0-9f53-c07586957608", "metadata": {}, "source": [ "## Functions" ] }, { "cell_type": "code", "execution_count": 52, "id": "09fb9ce5-1163-440f-945c-5de00fd9f079", "metadata": {}, "outputs": [], "source": [ "# Function that checks if final Output is ready for submission 
or needs revision \n", "\n", "def check_data(HEX):\n", "    \"\"\"Check that the submission frame HEX is ready to be submitted.\n", "\n", "    Prints the shape of HEX, the number of distinct \"CODIGO MUNICIPIO\" values and\n", "    the integer total of \"NIDOS 2020\", asserts that HEX has shape (112, 3) with\n", "    112 distinct municipality codes, and finally compares the sorted code+name\n", "    strings of HEX with those of the module-level `template` DataFrame.\n", "\n", "    Returns a DataFrame with the mismatching rows, or a confirmation string when\n", "    every code+name pair matches the template.\n", "    Raises AssertionError when the shape or the number of municipalities is wrong.\n", "    \"\"\"\n", "\n", "    def template_checker(HEX):\n", "        # Code and name are concatenated into one key, sorted, and compared position by position\n", "        submission_df = (HEX[\"CODIGO MUNICIPIO\"].astype(\"string\") + HEX[\"NOMBRE MUNICIPIO\"]).sort_values().reset_index(drop=True)\n", "        template_df = (template[\"CODIGO MUNICIPIO\"].astype(\"string\") + template[\"NOMBRE MUNICIPIO\"]).sort_values().reset_index(drop=True)\n", "        check_df = pd.DataFrame({\"submission_df\":submission_df,\"template_df\":template_df})\n", "        check_df[\"check\"] = check_df.submission_df == check_df.template_df\n", "        if (check_df.check == False).any():\n", "            # Global pandas option: lets all 112 rows be displayed when mismatches are returned\n", "            pd.options.display.max_rows = 112\n", "            return check_df.loc[check_df.check == False,:]\n", "        else: \n", "            return \"All Municipality Names and Codes to be submitted match the Template\"\n", "\n", "    print(\"Submission form Shape is\", HEX.shape)\n", "    print(\"Number of Municipalities is\", HEX[\"CODIGO MUNICIPIO\"].nunique())\n", "    print(\"The Total 2020 Nests' Prediction is\", int(HEX[\"NIDOS 2020\"].sum()))\n", "\n", "    assert HEX.shape == (112, 3), \"Error: Shape is incorrect.\"\n", "    assert HEX[\"CODIGO MUNICIPIO\"].nunique() == 112, \"Error: Number of unique municipalities is incorrect.\"\n", "    return template_checker(HEX)" ] }, { "cell_type": "markdown", "id": "ccb2292a-2866-451b-95b6-6150b4b76f09", "metadata": {}, "source": [ "## Get the data" ] }, { "cell_type": "code", "execution_count": 53, "id": "e1bc52db-6695-493c-b5d8-87618df04690", "metadata": {}, "outputs": [], "source": [ "QUEEN_train = pd.read_csv('../Feeder_months/WBds03_QUEENtrainMONTHS.csv', sep=',')\n", "QUEEN_predict = pd.read_csv('../Feeder_months/WBds03_QUEENpredictMONTHS.csv', sep=',')\n", "\n", "clustersMario = pd.read_csv(\"../auxiliary_files/WBds_CLUSTERSnests.csv\")\n", "\n", "template = pd.read_csv(\"../../../Input_open_data/ds01_PLANTILLA-RETO-AVISPAS-KOPURU.csv\",sep=\";\", encoding=\"utf-8\")" ] }, { "cell_type": "code", "execution_count": 54, "id": "8f42f8f7-3c3d-402f-b7a3-5d073b3c9315", "metadata": { "tags": [] 
}, "outputs": [], "source": [ "#QUEEN_predict.isnull().sum()" ] }, { "cell_type": "code", "execution_count": 55, "id": "09be088b-b37a-4ffc-b492-2eb78560e2eb", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(2688, 43)" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "QUEEN_train.shape" ] }, { "cell_type": "code", "execution_count": 56, "id": "d9660c01-5ab0-4e8c-af3e-a616bffcd854", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1344, 43)" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "QUEEN_predict.shape" ] }, { "cell_type": "markdown", "id": "294dd4f3-7342-4ecb-97cc-4e61282097d5", "metadata": {}, "source": [ "### Add in more Clusters (nest amount clusters)" ] }, { "cell_type": "code", "execution_count": 57, "id": "afac6fe7-d43b-4823-bf50-0adba2181923", "metadata": {}, "outputs": [], "source": [ "QUEEN_train = pd.merge(QUEEN_train, clustersMario, how = 'left', on = ['municip_code', 'municip_name'])\n", "QUEEN_predict = pd.merge(QUEEN_predict, clustersMario, how = 'left', on = ['municip_code', 'municip_name'])" ] }, { "cell_type": "code", "execution_count": 58, "id": "bd93f890-27ff-43d8-8574-6d4f156c09ac", "metadata": {}, "outputs": [], "source": [ "QUEEN_train.fillna(4, inplace=True)\n", "QUEEN_predict.fillna(4, inplace=True)" ] }, { "cell_type": "code", "execution_count": 59, "id": "81b4a261-a2b3-4f10-8672-0c9bf63e8f19", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(2688, 44)" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "QUEEN_train.shape" ] }, { "cell_type": "code", "execution_count": 60, "id": "445c00d1-f046-40d9-8b30-bea28651d58e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1344, 44)" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "QUEEN_predict.shape" ] }, { "cell_type": "code", "execution_count": 61, "id": 
"07dad17a-8932-428c-ac33-91b31e0cdec0", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.0 792\n", "2.0 492\n", "1.0 48\n", "4.0 12\n", "Name: Cluster, dtype: int64" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "QUEEN_predict.Cluster.value_counts()" ] }, { "cell_type": "markdown", "id": "2b22ede1-691b-445a-8da7-99183147c476", "metadata": {}, "source": [ "## Get hyperparameters with GridsearchCV using 2018's features (i.e. 2019's nests) as the test year" ] }, { "cell_type": "code", "execution_count": 62, "id": "996f2380-d71b-4d2b-bf44-4c7c1dd459d9", "metadata": { "tags": [] }, "outputs": [], "source": [ "# The target variable\n", "hyper_y_train = QUEEN_train.loc[QUEEN_train.year_offset.isin([2017]), ['municip_code', 'year_offset', 'month', 'NESTS']]\n", "hyper_y_train = hyper_y_train.sort_values(by=['year_offset', 'month', 'municip_code'], ascending=True)\n", "hyper_y_train.set_index(['year_offset', 'month', 'municip_code'], inplace=True)\n", "\n", "hyper_y_test = QUEEN_train.loc[QUEEN_train.year_offset.isin([2018]), ['municip_code', 'year_offset', 'month', 'NESTS']]\n", "hyper_y_test = hyper_y_test.sort_values(by=['year_offset', 'month', 'municip_code'], ascending=True)\n", "hyper_y_test.set_index(['year_offset', 'month', 'municip_code'], inplace=True)\n", "\n", "\n", "# The features matrix\n", "hyperXtrain = QUEEN_train.loc[QUEEN_train.year_offset.isin([2017]), :].drop(['municip_name', 'station_code', 'station_name', 'NESTS'], axis=1)\n", "hyperXtrain = hyperXtrain.sort_values(by=['year_offset', 'month', 'municip_code'], ascending=True)\n", "hyperXtrain.set_index(['year_offset', 'month', 'municip_code'], inplace=True)\n", "\n", "hyperXtest = QUEEN_train.loc[QUEEN_train.year_offset.isin([2018]), :].drop(['municip_name', 'station_code', 'station_name', 'NESTS'], axis=1)\n", "hyperXtest = hyperXtest.sort_values(by=['year_offset', 'month', 'municip_code'], ascending=True)\n", 
"hyperXtest.set_index(['year_offset', 'month', 'municip_code'], inplace=True)" ] }, { "cell_type": "code", "execution_count": 63, "id": "5b468f67-247d-481e-8c8d-1ff9be0965cf", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Fitting 3 folds for each of 9 candidates, totalling 27 fits\n", "[04:07:39] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/objective/regression_obj.cu:171: reg:linear is now deprecated in favor of reg:squarederror.\n", "[04:07:39] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:573: \n", "Parameters: { \"silent\" } might not be used.\n", "\n", " This may not be accurate due to some parameters are only used in language bindings but\n", " passed down to XGBoost core. Or some parameters are not used but slip through this\n", " verification. Please open an issue if you find above cases.\n", "\n", "\n", "-0.8732560531236078\n" ] } ], "source": [ "xgb1 = XGBRegressor(random_state=23)\n", "parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower\n", " 'objective':['reg:linear'],\n", " 'learning_rate': [.03, 0.05, .07], #so called `eta` value\n", " 'max_depth': [5, 6, 7],\n", " 'min_child_weight': [4],\n", " 'silent': [1],\n", " 'subsample': [0.7],\n", " 'colsample_bytree': [0.7],\n", " 'n_estimators': [500]}\n", "\n", "xgb_grid = GridSearchCV(xgb1,\n", " parameters,\n", " cv = 3,\n", " n_jobs = 5,\n", " verbose=True)\n", "\n", "xgb_grid.fit(hyperXtrain, hyper_y_train)\n", "\n", "print(xgb_grid.best_score_)" ] }, { "cell_type": "code", "execution_count": 64, "id": "215120b2-58f6-4974-a59a-7005fb80e8eb", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'colsample_bytree': 0.7, 'learning_rate': 0.03, 'max_depth': 5, 'min_child_weight': 4, 'n_estimators': 500, 'nthread': 4, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.7}\n" ] } ], "source": [ "print(xgb_grid.best_params_)" ] }, { 
"cell_type": "code", "execution_count": 65, "id": "56e4a20d-85d7-4304-8760-e45b9a13c52f", "metadata": {}, "outputs": [], "source": [ "y_xgb_grid = xgb_grid.best_estimator_.predict(hyperXtest)" ] }, { "cell_type": "code", "execution_count": 66, "id": "c3c47027-ec43-479a-9818-c4295550a5ca", "metadata": {}, "outputs": [], "source": [ "#matrix = confusion_matrix(hyper_y_test, y_xgb_grid)" ] }, { "cell_type": "code", "execution_count": 67, "id": "18336fdc-6db0-4217-a7be-99260c147012", "metadata": {}, "outputs": [], "source": [ "#ax = sns.heatmap(\n", "# matrix.T, square=True, annot=True, fmt=\"d\", cbar=False, cmap=\"viridis\",\n", "# xticklabels=[\"0\", \"1\"], yticklabels=[\"0\", \"1\"]\n", "#)\n", "#ax.set_xlabel(\"True label\")\n", "#ax.set_ylabel(\"Predicted label\");" ] }, { "cell_type": "code", "execution_count": 68, "id": "71d8bfad-e0dd-4805-b1fc-ad9c8d1f0413", "metadata": {}, "outputs": [], "source": [ "#print(classification_report(hyper_y_test, y_xgb_grid))" ] }, { "cell_type": "markdown", "id": "02e47661-dd8c-4228-b35f-ec14059acf11", "metadata": {}, "source": [ "## Prediction time!" ] }, { "cell_type": "markdown", "id": "21e88d86-2520-4b11-8d79-bb6e7d1ceb1f", "metadata": {}, "source": [ "### 1. Choose the model class" ] }, { "cell_type": "code", "execution_count": 69, "id": "b0dcb655-54cb-412b-958e-45d96f1f4f50", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "xgboost.sklearn.XGBRegressor" ] }, "execution_count": 69, "metadata": {}, "output_type": "execute_result" } ], "source": [ "XGBRegressor" ] }, { "cell_type": "markdown", "id": "376df578-0e36-4453-9cb9-ad0b73a9123c", "metadata": {}, "source": [ "### 2. Instantiate the model" ] }, { "cell_type": "code", "execution_count": 70, "id": "9c64b189-11cf-49c1-96cd-aa127509e6d4", "metadata": {}, "outputs": [], "source": [ "xgb = xgb_grid.best_estimator_" ] }, { "cell_type": "markdown", "id": "9c74df2a-c0d9-4afc-8cf1-56405fc6a75c", "metadata": {}, "source": [ "### 3. 
Prepare Feature matrix and Target variable" ] }, { "cell_type": "code", "execution_count": 71, "id": "b6276909-8714-4b2d-aeb6-1579563a3cfa", "metadata": { "tags": [] }, "outputs": [], "source": [ "# The target variable\n", "y_train = QUEEN_train.loc[:, ['municip_code', 'year_offset', 'month', 'NESTS']]\n", "#y_train = y_train.sort_values(by=['year_offset', 'month', 'municip_code'], ascending=True)\n", "y_train.set_index(['year_offset', 'month', 'municip_code'], inplace=True)\n", "\n", "y_predict = QUEEN_predict.loc[:, ['municip_code', 'year_offset', 'month', 'NESTS']]\n", "#y_predict = y_predict.sort_values(by=['year_offset', 'month', 'municip_code'], ascending=True)\n", "y_predict.set_index(['year_offset', 'month', 'municip_code'], inplace=True)\n", "\n", "# The features matrix\n", "X_train = QUEEN_train.drop(['municip_name', 'station_code', 'station_name', 'NESTS'], axis=1)\n", "#X_train = X_train.sort_values(by=['year_offset', 'month', 'municip_code'], ascending=True)\n", "X_train.set_index(['year_offset', 'month', 'municip_code'], inplace=True)\n", "\n", "X_predict = QUEEN_predict.drop(['municip_name', 'station_code', 'station_name', 'NESTS'], axis=1)\n", "#X_predict = X_predict.sort_values(by=['year_offset', 'month', 'municip_code'], ascending=True)\n", "X_predict.set_index(['year_offset', 'month', 'municip_code'], inplace=True)" ] }, { "cell_type": "code", "execution_count": 72, "id": "f84134d0-f089-45c0-855e-ddaebb0414e3", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(2688, 37)" ] }, "execution_count": 72, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train.shape" ] }, { "cell_type": "code", "execution_count": 73, "id": "14b502af-e66e-4e9b-af81-713cef08800e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(2688, 1)" ] }, "execution_count": 73, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_train.shape" ] }, { "cell_type": "code", "execution_count": 74, "id": 
"08d3fe91-ecc3-4a6b-aa66-bf4763d597db", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1344, 37)" ] }, "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_predict.shape" ] }, { "cell_type": "code", "execution_count": 75, "id": "38e83bd4-3302-41e2-b78f-ea017c632fc6", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1344, 1)" ] }, "execution_count": 75, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_predict.shape" ] }, { "cell_type": "markdown", "id": "cf56ca71-4efa-45e2-9b46-dfd27a9678e6", "metadata": {}, "source": [ "### 4. Fit the model to the training data sets" ] }, { "cell_type": "markdown", "id": "c1baa614-e0d6-4a29-ba6a-ce7243deb938", "metadata": {}, "source": [ "#### Scale and get feature importance" ] }, { "cell_type": "code", "execution_count": 76, "id": "a5e29e1e-d356-4118-b1bd-9c1bd4fdcb6f", "metadata": {}, "outputs": [], "source": [ "#X = X_train\n", "#y = y_train\n", "#scalators = X.columns\n", "#X[scalators] = preprocessing.minmax_scale(X[scalators])" ] }, { "cell_type": "code", "execution_count": 77, "id": "afd7d13b-9349-4764-abf9-6792e748f80f", "metadata": {}, "outputs": [], "source": [ "# define the model\n", "#model_fi = XGBRegressor(random_state=23)\n", "\n", "# fit the model\n", "#model_fi.fit(X, y)" ] }, { "cell_type": "code", "execution_count": 78, "id": "2173c355-cab3-4ba7-a1eb-9c179fdf23e3", "metadata": { "tags": [] }, "outputs": [], "source": [ "# get importance\n", "#importance = model_fi.feature_importances_\n", "# summarize feature importance\n", "#for i,v in enumerate(importance):\n", "#\tprint('Feature: %0d, Score: %.5f' % (i,v))\n", "# plot feature importance\n", "#plot_importance(model_fi, height=0.5, xlabel=\"F-Score\", ylabel=\"Feature Importance\", grid=False)\n", "#plt.show()" ] }, { "cell_type": "markdown", "id": "43eec2b5-0ba6-4b93-aff2-3e64cca0c2a7", "metadata": {}, "source": [ "#### Now, do fit the model but only with the relevant features" ] }, 
{ "cell_type": "code", "execution_count": 79, "id": "fe675993-0bdd-420e-b4e5-c03b09a8c4f2", "metadata": {}, "outputs": [], "source": [ "# Single shared list of the relevant features: the training and prediction matrices\n", "# are cut to exactly the same columns, in the same order, and cannot drift apart.\n", "relevant_features = ['population', 'weath_humidity', 'food_fruit', 'weath_maxLevel', 'food_txakoli', 'weath_midLevel', 'weath_minLevel', 'colonies_amount', 'weath_maxWindM', 'weath_meanWindM', 'weath_accuRainfall', 'weath_10minRainfall', 'food_kiwi', 'food_apple', 'weath_days_rain1mm', 'weath_meanDayMaxWind', 'weath_meanTemp']\n", "\n", "X_train = X_train.loc[:, relevant_features]\n", "X_predict = X_predict.loc[:, relevant_features]" ] }, { "cell_type": "code", "execution_count": 80, "id": "3dd1d415-6a4e-4756-bb7f-cce3832a42f1", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[04:07:40] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/objective/regression_obj.cu:171: reg:linear is now deprecated in favor of reg:squarederror.\n", "[04:07:40] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:573: \n", "Parameters: { \"silent\" } might not be used.\n", "\n", " This may not be accurate due to some parameters are only used in language bindings but\n", " passed down to XGBoost core. Or some parameters are not used but slip through this\n", " verification. 
Please open an issue if you find above cases.\n", "\n", "\n" ] }, { "data": { "text/plain": [ "XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n", " colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1,\n", " importance_type='gain', interaction_constraints='',\n", " learning_rate=0.03, max_delta_step=0, max_depth=5,\n", " min_child_weight=4, missing=nan, monotone_constraints='()',\n", " n_estimators=500, n_jobs=4, nthread=4, num_parallel_tree=1,\n", " objective='reg:linear', random_state=23, reg_alpha=0, reg_lambda=1,\n", " scale_pos_weight=1, silent=1, subsample=0.7, tree_method='exact',\n", " validate_parameters=1, verbosity=None)" ] }, "execution_count": 80, "metadata": {}, "output_type": "execute_result" } ], "source": [ "xgb.fit(X_train, y_train)" ] }, { "cell_type": "markdown", "id": "8512cec4-74d9-4e4c-8175-9e7797e16192", "metadata": {}, "source": [ "### 5. Predict the labels for new data" ] }, { "cell_type": "code", "execution_count": 81, "id": "dbb7a4d0-3df4-4c89-b30d-f8fdb9e4566b", "metadata": {}, "outputs": [], "source": [ "y_predict = xgb.predict(X_predict)" ] }, { "cell_type": "code", "execution_count": 82, "id": "c15d9b7e-f072-4df7-b65e-c624997bbcd5", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "R² on the training set: 88%\n" ] } ], "source": [ "# score() of a regressor returns the coefficient of determination (R²), not an accuracy\n", "r2_train = xgb.score(X_train, y_train)\n", "print(f\"R² on the training set: {r2_train:.0%}\")" ] }, { "cell_type": "code", "execution_count": 83, "id": "21ebb5c9-7a67-4710-9167-7ba8b4732c8a", "metadata": {}, "outputs": [], "source": [ "# y_predict was produced by xgb.predict(X_predict), so xgb.score(X_predict, y_predict)\n", "# compares the model with its own output and is 100% by construction; it is not a\n", "# test-set score. Only sanity checks on the predictions are performed here.\n", "assert len(y_predict) == len(X_predict), \"Expected one prediction per row of X_predict\"\n", "assert np.isfinite(y_predict).all(), \"Predictions contain NaN or infinite values\"" ] }, { "cell_type": "code", "execution_count": 84, "id": "1d73fcda-4959-4988-a8cd-ba100f8603d2", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1344,)" ] }, 
"execution_count": 84, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_predict.shape" ] }, { "cell_type": "code", "execution_count": 85, "id": "a2b34068-a080-4093-8ac4-6ae64c512390", "metadata": {}, "outputs": [], "source": [ "QUEEN_predict['NESTS'] = y_predict" ] }, { "cell_type": "code", "execution_count": 86, "id": "5e13172a-ae79-406c-a1ab-129ed6434389", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2904.2285" ] }, "execution_count": 86, "metadata": {}, "output_type": "execute_result" } ], "source": [ "QUEEN_predict.NESTS.sum()" ] }, { "cell_type": "code", "execution_count": 87, "id": "54e7287b-3467-4186-8d7f-9b45bf793c59", "metadata": {}, "outputs": [], "source": [ "# A nest count cannot be negative: clip negative predictions to 0.\n", "# One .loc call on the frame itself, instead of chained indexing on the NESTS column\n", "# (which raised SettingWithCopyWarning and may write to a temporary copy).\n", "QUEEN_predict.loc[QUEEN_predict.NESTS < 0, 'NESTS'] = 0" ] }, { "cell_type": "code", "execution_count": 88, "id": "7cfbaa57-8cf2-4306-8cbd-f0d2b15f442e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2909.1338" ] }, "execution_count": 88, "metadata": {}, "output_type": "execute_result" } ], "source": [ "QUEEN_predict.NESTS.sum()" ] }, { "cell_type": "markdown", "id": "3423a8da-0a1c-48f2-813e-44d2988cb52a", "metadata": {}, "source": [ "## Prepare the dataset for submission" ] }, { "cell_type": "code", "execution_count": 89, "id": "b89c9e5d-6c6c-450f-bae6-deee72f0d6a6", "metadata": {}, "outputs": [], "source": [ "HEX = QUEEN_predict.loc[:,['municip_code', 'municip_name', 'NESTS']].groupby(by=['municip_code', 'municip_name'], as_index=False).sum()" ] }, { "cell_type": "markdown", "id": "f0a437ca-e975-42ab-b15a-1ee8ced3bdd0", "metadata": {}, "source": [ "## Adjust manually for Bilbao 48020 and generate 
the output" ] }, { "cell_type": "code", "execution_count": 90, "id": "993847dc-43b2-412f-9746-02b2be1c805a", "metadata": {}, "outputs": [], "source": [ "HEX.loc[HEX.municip_code.isin([48020]), 'NESTS'] = 0" ] }, { "cell_type": "code", "execution_count": 91, "id": "8fc877e4-5616-4c3a-9af5-87a7de17b1dc", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
municip_codemunicip_nameNESTS
1948020Bilbao0.000000
2148022Karrantza Harana/Valle de Carranza20.604404
5048051Lanestosa5.878829
7048071Muskiz22.939783
7348074Urduña/Orduña22.237362
8748088Ubide4.863704
\n", "
" ], "text/plain": [ " municip_code municip_name NESTS\n", "19 48020 Bilbao 0.000000\n", "21 48022 Karrantza Harana/Valle de Carranza 20.604404\n", "50 48051 Lanestosa 5.878829\n", "70 48071 Muskiz 22.939783\n", "73 48074 Urduña/Orduña 22.237362\n", "87 48088 Ubide 4.863704" ] }, "execution_count": 91, "metadata": {}, "output_type": "execute_result" } ], "source": [ "HEX.loc[HEX.municip_code.isin([48022, 48071, 48088, 48074, 48051, 48020]), :]" ] }, { "cell_type": "code", "execution_count": 92, "id": "24305092-c891-4e1e-91cb-0a23090e3f67", "metadata": {}, "outputs": [], "source": [ "HEX.columns = [\"CODIGO MUNICIPIO\", \"NOMBRE MUNICIPIO\", \"NIDOS 2020\"] # change column names to Spanish (Competition template)" ] }, { "cell_type": "code", "execution_count": 93, "id": "bc3b17dd-e71a-427e-9525-051c70128969", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Submission form Shape is (112, 3)\n", "Number of Municipalities is 112\n", "The Total 2020 Nests' Prediction is 2900\n" ] }, { "data": { "text/plain": [ "'All Municipality Names and Codes to be submitted match the Template'" ] }, "execution_count": 93, "metadata": {}, "output_type": "execute_result" } ], "source": [ "check_data(HEX)" ] }, { "cell_type": "markdown", "id": "25535ba3-2b88-4461-9503-84f3606f13ca", "metadata": {}, "source": [ "### Export dataset for submission" ] }, { "cell_type": "code", "execution_count": 94, "id": "f209395d-f4f9-47ba-8bf1-a3b73fb8ed90", "metadata": {}, "outputs": [], "source": [ "HEX.to_csv('WaspBusters_20210609_136-mXGB-prodigal-GSCV-noSort-FI-no0s.csv', index=False)" ] }, { "cell_type": "markdown", "id": "ab29f668-6aeb-40e3-8a3d-1a999f9aae2e", "metadata": {}, "source": [ "## VERSION Manual adjustments" ] }, { "cell_type": "code", "execution_count": 95, "id": "fff13fc8-2c9b-4b21-89ee-8b114283c276", "metadata": {}, "outputs": [], "source": [ "HEX.columns = ['municip_code', 'municip_name', 'NESTS'] # change column names to Spanish (Competition 
template)" ] }, { "cell_type": "code", "execution_count": 96, "id": "b86a00c3-d839-4a5d-931f-9e65f77ed6a1", "metadata": {}, "outputs": [], "source": [ "# Manual overrides, keyed by municipality code so that every value is tied to its row.\n", "# The earlier positional list [0,0,1,0,1] was assigned in HEX row order (ascending\n", "# municip_code), not in the order of the codes passed to isin(); this mapping spells\n", "# out the values that assignment actually produced (see the table displayed below).\n", "# NOTE(review): if the list was meant to follow the isin() order, 48071/48088/48051\n", "# should be 0/1/1 instead - confirm with the authors.\n", "manual_nests = {48022: 0, 48051: 0, 48071: 1, 48074: 0, 48088: 1}\n", "override_mask = HEX.municip_code.isin(list(manual_nests))\n", "HEX.loc[override_mask, 'NESTS'] = HEX.loc[override_mask, 'municip_code'].map(manual_nests)" ] }, { "cell_type": "code", "execution_count": 97, "id": "32a429bc-4dcf-4cec-9b57-9c0b297888c6", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
municip_codemunicip_nameNESTS
1948020Bilbao0.0
2148022Karrantza Harana/Valle de Carranza0.0
5048051Lanestosa0.0
7048071Muskiz1.0
7348074Urduña/Orduña0.0
8748088Ubide1.0
\n", "
" ], "text/plain": [ " municip_code municip_name NESTS\n", "19 48020 Bilbao 0.0\n", "21 48022 Karrantza Harana/Valle de Carranza 0.0\n", "50 48051 Lanestosa 0.0\n", "70 48071 Muskiz 1.0\n", "73 48074 Urduña/Orduña 0.0\n", "87 48088 Ubide 1.0" ] }, "execution_count": 97, "metadata": {}, "output_type": "execute_result" } ], "source": [ "HEX.loc[HEX.municip_code.isin([48022, 48071, 48088, 48074, 48051, 48020]), :]" ] }, { "cell_type": "code", "execution_count": 98, "id": "71e6b3a6-fc2b-4555-be00-ef7cef9c5789", "metadata": {}, "outputs": [], "source": [ "HEX.columns = [\"CODIGO MUNICIPIO\", \"NOMBRE MUNICIPIO\", \"NIDOS 2020\"] # change column names to Spanish (Competition template)" ] }, { "cell_type": "code", "execution_count": 99, "id": "59b95188-bbb5-45c2-9ed3-14bf93807813", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Submission form Shape is (112, 3)\n", "Number of Municipalities is 112\n", "The Total 2020 Nests' Prediction is 2826\n" ] }, { "data": { "text/plain": [ "'All Municipality Names and Codes to be submitted match the Template'" ] }, "execution_count": 99, "metadata": {}, "output_type": "execute_result" } ], "source": [ "check_data(HEX)" ] }, { "cell_type": "markdown", "id": "cc759394-8282-496d-9901-090c58e165e2", "metadata": {}, "source": [ "### Export dataset for submission" ] }, { "cell_type": "code", "execution_count": 100, "id": "22a4eacd-46a5-45d6-a4de-bc3fe235b01a", "metadata": {}, "outputs": [], "source": [ "HEX.to_csv('WaspBusters_20210609_135-mXGB-prodigal-GSCV-noSort-FI-0s.csv', index=False)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.8" } }, "nbformat": 4, "nbformat_minor": 5 }