From 39d8c4dc2c6f66af5243c6256740399cab272353 Mon Sep 17 00:00:00 2001 From: NicolasPce Date: Thu, 13 Jan 2022 17:57:06 +0100 Subject: [PATCH] =?UTF-8?q?Nicol=C3=A1s=20-=20Final=203=20horas?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Main Cobify-checkpoint.ipynb | 992 ++++++++++++++++++ .../MainCobify-checkpoint.ipynb | 992 ++++++++++++++++++ MainCobify.ipynb | 992 ++++++++++++++++++ 3 files changed, 2976 insertions(+) create mode 100644 .ipynb_checkpoints/Main Cobify-checkpoint.ipynb create mode 100644 .ipynb_checkpoints/MainCobify-checkpoint.ipynb create mode 100644 MainCobify.ipynb diff --git a/.ipynb_checkpoints/Main Cobify-checkpoint.ipynb b/.ipynb_checkpoints/Main Cobify-checkpoint.ipynb new file mode 100644 index 0000000..948486c --- /dev/null +++ b/.ipynb_checkpoints/Main Cobify-checkpoint.ipynb @@ -0,0 +1,992 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "19e233cb", + "metadata": {}, + "source": [ + "### Importamos librerías" + ] + }, + { + "cell_type": "code", + "execution_count": 134, + "id": "f04297b3", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "from sklearn import metrics\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.model_selection import train_test_split\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")\n", + "from sklearn.svm import SVC\n", + "from sklearn.metrics import mean_squared_error, r2_score\n", + "from sklearn.ensemble import RandomForestRegressor\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.ensemble import GradientBoostingRegressor\n", + "\n", + "\n", + "from matplotlib import pyplot as plt\n", + "%matplotlib inline\n", + "%config Inlinebackend.figure_format = 'retina'\n", + "\n", + "import seaborn as sns\n", + "sns.set_context('poster')\n", + "sns.set(rc={'figure.figsize': (16., 9.)})\n", + "sns.set_style('whitegrid')\n", + "\n", + "import numpy as 
np\n" + ] + }, + { + "cell_type": "code", + "execution_count": 118, + "id": "21019f1f", + "metadata": {}, + "outputs": [], + "source": [ + "# Nos traemos el data set, teniendo en cuenta que los decimales están separados por coma\n", + "df = pd.read_csv('measurements.csv',decimal=',')\n" + ] + }, + { + "cell_type": "markdown", + "id": "089a188e", + "metadata": {}, + "source": [ + "### Análisis y limpieza de datos" + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "id": "ed6cfec1", + "metadata": {}, + "outputs": [], + "source": [ + "#df2 = pd.readcsv(\"measurements2.xlsx\",encoding = \"ISO-8859-1\")" + ] + }, + { + "cell_type": "code", + "execution_count": 120, + "id": "6c5970a4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(388, 12)" + ] + }, + "execution_count": 120, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "id": "1b7b5cd1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
distanceconsumespeedtemp_insidetemp_outsidespecialsgas_typeACrainsunrefill litersrefill gas
028.05.02621.512NaNE1000045.0E10
112.04.23021.513NaNE10000NaNNaN
211.25.53821.515NaNE10000NaNNaN
\n", + "
" + ], + "text/plain": [ + " distance consume speed temp_inside temp_outside specials gas_type AC \\\n", + "0 28.0 5.0 26 21.5 12 NaN E10 0 \n", + "1 12.0 4.2 30 21.5 13 NaN E10 0 \n", + "2 11.2 5.5 38 21.5 15 NaN E10 0 \n", + "\n", + " rain sun refill liters refill gas \n", + "0 0 0 45.0 E10 \n", + "1 0 0 NaN NaN \n", + "2 0 0 NaN NaN " + ] + }, + "execution_count": 121, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "id": "7655fa36", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "distance 0\n", + "consume 0\n", + "speed 0\n", + "temp_inside 12\n", + "temp_outside 0\n", + "specials 295\n", + "gas_type 0\n", + "AC 0\n", + "rain 0\n", + "sun 0\n", + "refill liters 375\n", + "refill gas 375\n", + "dtype: int64" + ] + }, + "execution_count": 122, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 123, + "id": "4dd5bb12", + "metadata": {}, + "outputs": [], + "source": [ + "#Eliminamos la columna refill gas, ya que es la misma que gas_type, aunque en algunas falten datos se observa como cambia a la par\n", + "#Elimanamos tambien la columna specials, porque nos da la misma información que las columna AC, rain y sun\n", + "df.drop(['refill gas', 'specials'], axis = 1 , inplace = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 124, + "id": "ab3d31ac", + "metadata": {}, + "outputs": [], + "source": [ + "#Para la columna refill liters le asignamos en los valores que faltan el valor que coincide con el gas_type" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "id": "86c99310", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "distance 0\n", + "consume 0\n", + "speed 0\n", + "temp_inside 12\n", + "temp_outside 0\n", + "gas_type 0\n", + "AC 0\n", + "rain 0\n", + "sun 0\n", + "refill liters 375\n", + 
"dtype: int64" + ] + }, + "execution_count": 125, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 126, + "id": "4eb23638", + "metadata": {}, + "outputs": [], + "source": [ + "#Para los 12 datos de temperatura interior que nos faltan al ser un 3% de la muestra le asignaremos el valor medio" + ] + }, + { + "cell_type": "code", + "execution_count": 127, + "id": "6cfa05b0", + "metadata": {}, + "outputs": [], + "source": [ + "df['temp_inside'] = df['temp_inside'].fillna(22)" + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "id": "a0f6405a", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
distanceconsumespeedtemp_insidetemp_outsidegas_typeACrainsunrefill liters
028.05.02621.512E1000045.0
112.04.23021.513E1000045.0
211.25.53821.515E1000045.0
\n", + "
" + ], + "text/plain": [ + " distance consume speed temp_inside temp_outside gas_type AC rain \\\n", + "0 28.0 5.0 26 21.5 12 E10 0 0 \n", + "1 12.0 4.2 30 21.5 13 E10 0 0 \n", + "2 11.2 5.5 38 21.5 15 E10 0 0 \n", + "\n", + " sun refill liters \n", + "0 0 45.0 \n", + "1 0 45.0 \n", + "2 0 45.0 " + ] + }, + "execution_count": 128, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Para la columna refill liters, completamos el valor con el valor previo, ya que también parece ir correlacionada con elgas type\n", + "df = df.fillna(method='ffill')\n", + "df.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 129, + "id": "305762b9", + "metadata": {}, + "outputs": [], + "source": [ + "#Normalizamos los dos tipos de combustible para facilitar en análisis de su correlación\n", + "\n", + "combus= {\"E10\": 1,\n", + " \"SP98\": 0,\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": 130, + "id": "e77d01d6", + "metadata": {}, + "outputs": [], + "source": [ + "df.gas_type = df.gas_type.map(combus)" + ] + }, + { + "cell_type": "markdown", + "id": "7e97b7b1", + "metadata": {}, + "source": [ + "### Correlación" + ] + }, + { + "cell_type": "code", + "execution_count": 131, + "id": "414cc0af", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
distanceconsumespeedtemp_insidetemp_outsidegas_typeACrainsunrefill liters
distance1.000000-0.1289670.5622990.0747340.0881750.053411-0.025738-0.0197910.081120-0.097117
consume-0.1289671.000000-0.227866-0.161470-0.3208110.0153270.0965910.248118-0.1706670.139765
speed0.562299-0.2278661.0000000.0591390.0154110.097360-0.0354080.0094890.081618-0.086277
temp_inside0.074734-0.1614700.0591391.0000000.359649-0.0109210.296719-0.0372040.242237-0.077320
temp_outside0.088175-0.3208110.0154110.3596491.000000-0.1487050.167562-0.1863150.346903-0.356930
gas_type0.0534110.0153270.097360-0.010921-0.1487051.000000-0.105285-0.060328-0.0227610.133453
AC-0.0257380.096591-0.0354080.2967190.167562-0.1052851.0000000.2429150.088598-0.079404
rain-0.0197910.2481180.009489-0.037204-0.186315-0.0603280.2429151.000000-0.112650-0.083591
sun0.081120-0.1706670.0816180.2422370.346903-0.0227610.088598-0.1126501.000000-0.055477
refill liters-0.0971170.139765-0.086277-0.077320-0.3569300.133453-0.079404-0.083591-0.0554771.000000
\n", + "
" + ], + "text/plain": [ + " distance consume speed temp_inside temp_outside \\\n", + "distance 1.000000 -0.128967 0.562299 0.074734 0.088175 \n", + "consume -0.128967 1.000000 -0.227866 -0.161470 -0.320811 \n", + "speed 0.562299 -0.227866 1.000000 0.059139 0.015411 \n", + "temp_inside 0.074734 -0.161470 0.059139 1.000000 0.359649 \n", + "temp_outside 0.088175 -0.320811 0.015411 0.359649 1.000000 \n", + "gas_type 0.053411 0.015327 0.097360 -0.010921 -0.148705 \n", + "AC -0.025738 0.096591 -0.035408 0.296719 0.167562 \n", + "rain -0.019791 0.248118 0.009489 -0.037204 -0.186315 \n", + "sun 0.081120 -0.170667 0.081618 0.242237 0.346903 \n", + "refill liters -0.097117 0.139765 -0.086277 -0.077320 -0.356930 \n", + "\n", + " gas_type AC rain sun refill liters \n", + "distance 0.053411 -0.025738 -0.019791 0.081120 -0.097117 \n", + "consume 0.015327 0.096591 0.248118 -0.170667 0.139765 \n", + "speed 0.097360 -0.035408 0.009489 0.081618 -0.086277 \n", + "temp_inside -0.010921 0.296719 -0.037204 0.242237 -0.077320 \n", + "temp_outside -0.148705 0.167562 -0.186315 0.346903 -0.356930 \n", + "gas_type 1.000000 -0.105285 -0.060328 -0.022761 0.133453 \n", + "AC -0.105285 1.000000 0.242915 0.088598 -0.079404 \n", + "rain -0.060328 0.242915 1.000000 -0.112650 -0.083591 \n", + "sun -0.022761 0.088598 -0.112650 1.000000 -0.055477 \n", + "refill liters 0.133453 -0.079404 -0.083591 -0.055477 1.000000 " + ] + }, + "execution_count": 131, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#observamos la colerracion\n", + "corre = df.corr()\n", + "corre" + ] + }, + { + "cell_type": "code", + "execution_count": 132, + "id": "8287e7bb", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":1: DeprecationWarning: `np.bool` is a deprecated alias for the builtin `bool`. To silence this warning, use `bool` by itself. Doing this will not modify any behavior and is safe. 
If you specifically wanted the numpy scalar type, use `np.bool_` here.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " mask = np.triu(np.ones_like(corre, dtype=np.bool))\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "mask = np.triu(np.ones_like(corre, dtype=np.bool))\n", + "heatmap = sns.heatmap(corre, mask=mask, vmin=-1, vmax=1, annot=True, cmap='BrBG');" + ] + }, + { + "cell_type": "markdown", + "id": "ccf99683", + "metadata": {}, + "source": [ + "#### Observamos que el consumo se relaciona principal y negativamente con la velocidad y la temperatura exterior, sin ser ninguna superior al 0.5 (valor absoluto). Además, la velocidad se correlaciona positivamente con la distancia (trayectos largos a mayor velocidad). La temperatura exterior se correlaciona positivamente con el sol y la temperatura interior, y negativamente con los litros de relleno. No existen otras correlaciones superiores a 0.35\n" + ] + }, + { + "cell_type": "markdown", + "id": "b7a31b48", + "metadata": {}, + "source": [ + "### Predicciones consumo" + ] + }, + { + "cell_type": "code", + "execution_count": 164, + "id": "d668efbe", + "metadata": {}, + "outputs": [], + "source": [ + "#Probamos la regresión lineal" + ] + }, + { + "cell_type": "code", + "execution_count": 191, + "id": "fd46bd3f", + "metadata": {}, + "outputs": [], + "source": [ + "X = df.drop(\"consume\", axis=1)\n", + "y = df.consume\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)" + ] + }, + { + "cell_type": "code", + "execution_count": 192, + "id": "22d7de78", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LinearRegression()" + ] + }, + "execution_count": 192, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lr = LinearRegression()\n", + "lr.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 193, + "id": "dd9af45a", + "metadata": {}, + "outputs": [], + "source": [ + "y_pred_train = lr.predict(X_train)\n", + "y_pred_test = lr.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 194, + "id": "5a5aabcc", + 
"metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.22815357510490153" + ] + }, + "execution_count": 194, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_train_predict = lr.predict(X_train)\n", + "metrics.r2_score(y_train, y_train_predict)" + ] + }, + { + "cell_type": "code", + "execution_count": 195, + "id": "e9fb732c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.1020698110029008" + ] + }, + "execution_count": 195, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_test_predict = lr.predict(X_test)\n", + "metrics.r2_score(y_test, y_test_predict)" + ] + }, + { + "cell_type": "code", + "execution_count": 196, + "id": "984494b0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9154795542880089" + ] + }, + "execution_count": 196, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mean_squared_error(y_train, y_train_predict)**0.5" + ] + }, + { + "cell_type": "code", + "execution_count": 197, + "id": "69d01cd4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9375077349245255" + ] + }, + "execution_count": 197, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "metrics.mean_squared_error(y_test, y_test_predict)**0.5" + ] + }, + { + "cell_type": "code", + "execution_count": 198, + "id": "e3fc29b2", + "metadata": {}, + "outputs": [], + "source": [ + "#Probamos el random forest" + ] + }, + { + "cell_type": "code", + "execution_count": 199, + "id": "c1be8ed3", + "metadata": {}, + "outputs": [], + "source": [ + "X = df.drop(\"consume\", axis=1)\n", + "y = df.consume\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)" + ] + }, + { + "cell_type": "code", + "execution_count": 200, + "id": "ac6a5ed5", + "metadata": {}, + "outputs": [], + "source": [ + "forest = RandomForestRegressor()" + ] + }, + { + "cell_type": "code", + 
"execution_count": 201, + "id": "a84cc540", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "RandomForestRegressor()" + ] + }, + "execution_count": 201, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "forest.fit(X_train,y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 202, + "id": "5bb449ad", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.24289915602982237" + ] + }, + "execution_count": 202, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mean_squared_error(y_train,forest.predict(X_train)).round(3)**0.5" + ] + }, + { + "cell_type": "code", + "execution_count": 203, + "id": "a17fee39", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.6985699678629192" + ] + }, + "execution_count": 203, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mean_squared_error(y_test,forest.predict(X_test)).round(3)**0.5" + ] + }, + { + "cell_type": "markdown", + "id": "027fc881", + "metadata": {}, + "source": [ + "#### Obtenemos un RMSE más bajo en el random forest (0.24 en train y 0.70 en test) que en la regresión lineal (0.92 y 0.94). Aun así, la diferencia entre train y test apunta a sobreajuste y no parece un gran modelo" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c402484a", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c36a66d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/.ipynb_checkpoints/MainCobify-checkpoint.ipynb b/.ipynb_checkpoints/MainCobify-checkpoint.ipynb 
new file mode 100644 index 0000000..69d9ab2 --- /dev/null +++ b/.ipynb_checkpoints/MainCobify-checkpoint.ipynb @@ -0,0 +1,992 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "25980055", + "metadata": {}, + "source": [ + "### Importamos librerías" + ] + }, + { + "cell_type": "code", + "execution_count": 134, + "id": "f63a2ff3", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "from sklearn import metrics\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.model_selection import train_test_split\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")\n", + "from sklearn.svm import SVC\n", + "from sklearn.metrics import mean_squared_error, r2_score\n", + "from sklearn.ensemble import RandomForestRegressor\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.ensemble import GradientBoostingRegressor\n", + "\n", + "\n", + "from matplotlib import pyplot as plt\n", + "%matplotlib inline\n", + "%config Inlinebackend.figure_format = 'retina'\n", + "\n", + "import seaborn as sns\n", + "sns.set_context('poster')\n", + "sns.set(rc={'figure.figsize': (16., 9.)})\n", + "sns.set_style('whitegrid')\n", + "\n", + "import numpy as np\n" + ] + }, + { + "cell_type": "code", + "execution_count": 118, + "id": "e6e0415c", + "metadata": {}, + "outputs": [], + "source": [ + "# Nos traemos el data set, teniendo en cuenta que los decimales están separados por coma\n", + "df = pd.read_csv('measurements.csv',decimal=',')\n" + ] + }, + { + "cell_type": "markdown", + "id": "2ce8eeeb", + "metadata": {}, + "source": [ + "### Análisis y limpieza de datos" + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "id": "f3c01757", + "metadata": {}, + "outputs": [], + "source": [ + "#df2 = pd.readcsv(\"measurements2.xlsx\",encoding = \"ISO-8859-1\")" + ] + }, + { + "cell_type": "code", + "execution_count": 120, + "id": "6275ac7c", + "metadata": {}, + "outputs": [ + { + "data": { + 
"text/plain": [ + "(388, 12)" + ] + }, + "execution_count": 120, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "id": "eaf5b089", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
distanceconsumespeedtemp_insidetemp_outsidespecialsgas_typeACrainsunrefill litersrefill gas
028.05.02621.512NaNE1000045.0E10
112.04.23021.513NaNE10000NaNNaN
211.25.53821.515NaNE10000NaNNaN
\n", + "
" + ], + "text/plain": [ + " distance consume speed temp_inside temp_outside specials gas_type AC \\\n", + "0 28.0 5.0 26 21.5 12 NaN E10 0 \n", + "1 12.0 4.2 30 21.5 13 NaN E10 0 \n", + "2 11.2 5.5 38 21.5 15 NaN E10 0 \n", + "\n", + " rain sun refill liters refill gas \n", + "0 0 0 45.0 E10 \n", + "1 0 0 NaN NaN \n", + "2 0 0 NaN NaN " + ] + }, + "execution_count": 121, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "id": "43d9089e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "distance 0\n", + "consume 0\n", + "speed 0\n", + "temp_inside 12\n", + "temp_outside 0\n", + "specials 295\n", + "gas_type 0\n", + "AC 0\n", + "rain 0\n", + "sun 0\n", + "refill liters 375\n", + "refill gas 375\n", + "dtype: int64" + ] + }, + "execution_count": 122, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 123, + "id": "db8bcbb5", + "metadata": {}, + "outputs": [], + "source": [ + "#Eliminamos la columna refill gas, ya que es la misma que gas_type, aunque en algunas falten datos se observa como cambia a la par\n", + "#Elimanamos tambien la columna specials, porque nos da la misma información que las columna AC, rain y sun\n", + "df.drop(['refill gas', 'specials'], axis = 1 , inplace = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 124, + "id": "9464bdde", + "metadata": {}, + "outputs": [], + "source": [ + "#Para la columna refill liters le asignamos en los valores que faltan el valor que coincide con el gas_type" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "id": "f0459a09", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "distance 0\n", + "consume 0\n", + "speed 0\n", + "temp_inside 12\n", + "temp_outside 0\n", + "gas_type 0\n", + "AC 0\n", + "rain 0\n", + "sun 0\n", + "refill liters 375\n", + 
"dtype: int64" + ] + }, + "execution_count": 125, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 126, + "id": "04f84b0a", + "metadata": {}, + "outputs": [], + "source": [ + "#Para los 12 datos de temperatura interior que nos faltan al ser un 3% de la muestra le asignaremos el valor medio" + ] + }, + { + "cell_type": "code", + "execution_count": 127, + "id": "a5ecb335", + "metadata": {}, + "outputs": [], + "source": [ + "df['temp_inside'] = df['temp_inside'].fillna(22)" + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "id": "f24f2c8b", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
distanceconsumespeedtemp_insidetemp_outsidegas_typeACrainsunrefill liters
028.05.02621.512E1000045.0
112.04.23021.513E1000045.0
211.25.53821.515E1000045.0
\n", + "
" + ], + "text/plain": [ + " distance consume speed temp_inside temp_outside gas_type AC rain \\\n", + "0 28.0 5.0 26 21.5 12 E10 0 0 \n", + "1 12.0 4.2 30 21.5 13 E10 0 0 \n", + "2 11.2 5.5 38 21.5 15 E10 0 0 \n", + "\n", + " sun refill liters \n", + "0 0 45.0 \n", + "1 0 45.0 \n", + "2 0 45.0 " + ] + }, + "execution_count": 128, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Para la columna refill liters, completamos el valor con el valor previo, ya que también parece ir correlacionada con elgas type\n", + "df = df.fillna(method='ffill')\n", + "df.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 129, + "id": "df948205", + "metadata": {}, + "outputs": [], + "source": [ + "#Normalizamos los dos tipos de combustible para facilitar en análisis de su correlación\n", + "\n", + "combus= {\"E10\": 1,\n", + " \"SP98\": 0,\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": 130, + "id": "df8fdbdd", + "metadata": {}, + "outputs": [], + "source": [ + "df.gas_type = df.gas_type.map(combus)" + ] + }, + { + "cell_type": "markdown", + "id": "e30f7ca3", + "metadata": {}, + "source": [ + "### Correlación" + ] + }, + { + "cell_type": "code", + "execution_count": 131, + "id": "31628178", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
distanceconsumespeedtemp_insidetemp_outsidegas_typeACrainsunrefill liters
distance1.000000-0.1289670.5622990.0747340.0881750.053411-0.025738-0.0197910.081120-0.097117
consume-0.1289671.000000-0.227866-0.161470-0.3208110.0153270.0965910.248118-0.1706670.139765
speed0.562299-0.2278661.0000000.0591390.0154110.097360-0.0354080.0094890.081618-0.086277
temp_inside0.074734-0.1614700.0591391.0000000.359649-0.0109210.296719-0.0372040.242237-0.077320
temp_outside0.088175-0.3208110.0154110.3596491.000000-0.1487050.167562-0.1863150.346903-0.356930
gas_type0.0534110.0153270.097360-0.010921-0.1487051.000000-0.105285-0.060328-0.0227610.133453
AC-0.0257380.096591-0.0354080.2967190.167562-0.1052851.0000000.2429150.088598-0.079404
rain-0.0197910.2481180.009489-0.037204-0.186315-0.0603280.2429151.000000-0.112650-0.083591
sun0.081120-0.1706670.0816180.2422370.346903-0.0227610.088598-0.1126501.000000-0.055477
refill liters-0.0971170.139765-0.086277-0.077320-0.3569300.133453-0.079404-0.083591-0.0554771.000000
\n", + "
" + ], + "text/plain": [ + " distance consume speed temp_inside temp_outside \\\n", + "distance 1.000000 -0.128967 0.562299 0.074734 0.088175 \n", + "consume -0.128967 1.000000 -0.227866 -0.161470 -0.320811 \n", + "speed 0.562299 -0.227866 1.000000 0.059139 0.015411 \n", + "temp_inside 0.074734 -0.161470 0.059139 1.000000 0.359649 \n", + "temp_outside 0.088175 -0.320811 0.015411 0.359649 1.000000 \n", + "gas_type 0.053411 0.015327 0.097360 -0.010921 -0.148705 \n", + "AC -0.025738 0.096591 -0.035408 0.296719 0.167562 \n", + "rain -0.019791 0.248118 0.009489 -0.037204 -0.186315 \n", + "sun 0.081120 -0.170667 0.081618 0.242237 0.346903 \n", + "refill liters -0.097117 0.139765 -0.086277 -0.077320 -0.356930 \n", + "\n", + " gas_type AC rain sun refill liters \n", + "distance 0.053411 -0.025738 -0.019791 0.081120 -0.097117 \n", + "consume 0.015327 0.096591 0.248118 -0.170667 0.139765 \n", + "speed 0.097360 -0.035408 0.009489 0.081618 -0.086277 \n", + "temp_inside -0.010921 0.296719 -0.037204 0.242237 -0.077320 \n", + "temp_outside -0.148705 0.167562 -0.186315 0.346903 -0.356930 \n", + "gas_type 1.000000 -0.105285 -0.060328 -0.022761 0.133453 \n", + "AC -0.105285 1.000000 0.242915 0.088598 -0.079404 \n", + "rain -0.060328 0.242915 1.000000 -0.112650 -0.083591 \n", + "sun -0.022761 0.088598 -0.112650 1.000000 -0.055477 \n", + "refill liters 0.133453 -0.079404 -0.083591 -0.055477 1.000000 " + ] + }, + "execution_count": 131, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#observamos la colerracion\n", + "corre = df.corr()\n", + "corre" + ] + }, + { + "cell_type": "code", + "execution_count": 132, + "id": "f3a239ad", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":1: DeprecationWarning: `np.bool` is a deprecated alias for the builtin `bool`. To silence this warning, use `bool` by itself. Doing this will not modify any behavior and is safe. 
If you specifically wanted the numpy scalar type, use `np.bool_` here.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " mask = np.triu(np.ones_like(corre, dtype=np.bool))\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "mask = np.triu(np.ones_like(corre, dtype=np.bool))\n", + "heatmap = sns.heatmap(corre, mask=mask, vmin=-1, vmax=1, annot=True, cmap='BrBG');" + ] + }, + { + "cell_type": "markdown", + "id": "118ff471", + "metadata": {}, + "source": [ + "#### Observamos que el consumo se relaciona principal y negativamente con la velocidad y la temperatura exterior, sin ser ninguna superior al 0.5 (valor absoluto). Además, la velocidad se correlaciona positivamente con la distancia (trayectos largos a mayor velocidad). La temperatura exterior se correlaciona positivamente con el sol y la temperatura interior, y negativamente con los litros de relleno. No existen otras correlaciones superiores a 0.35\n" + ] + }, + { + "cell_type": "markdown", + "id": "d72e3839", + "metadata": {}, + "source": [ + "### Predicciones consumo" + ] + }, + { + "cell_type": "code", + "execution_count": 164, + "id": "1f31b290", + "metadata": {}, + "outputs": [], + "source": [ + "#Probamos la regresión lineal" + ] + }, + { + "cell_type": "code", + "execution_count": 191, + "id": "305a98fa", + "metadata": {}, + "outputs": [], + "source": [ + "X = df.drop(\"consume\", axis=1)\n", + "y = df.consume\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)" + ] + }, + { + "cell_type": "code", + "execution_count": 192, + "id": "5cbefd90", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LinearRegression()" + ] + }, + "execution_count": 192, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lr = LinearRegression()\n", + "lr.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 193, + "id": "4451dfa4", + "metadata": {}, + "outputs": [], + "source": [ + "y_pred_train = lr.predict(X_train)\n", + "y_pred_test = lr.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 194, + "id": "8bbff554", + 
"metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.22815357510490153" + ] + }, + "execution_count": 194, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_train_predict = lr.predict(X_train)\n", + "metrics.r2_score(y_train, y_train_predict)" + ] + }, + { + "cell_type": "code", + "execution_count": 195, + "id": "d2968680", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.1020698110029008" + ] + }, + "execution_count": 195, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_test_predict = lr.predict(X_test)\n", + "metrics.r2_score(y_test, y_test_predict)" + ] + }, + { + "cell_type": "code", + "execution_count": 196, + "id": "0c83c6cb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9154795542880089" + ] + }, + "execution_count": 196, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mean_squared_error(y_train, y_train_predict)**0.5" + ] + }, + { + "cell_type": "code", + "execution_count": 197, + "id": "1c1920a6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9375077349245255" + ] + }, + "execution_count": 197, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "metrics.mean_squared_error(y_test, y_test_predict)**0.5" + ] + }, + { + "cell_type": "code", + "execution_count": 198, + "id": "2e93fef8", + "metadata": {}, + "outputs": [], + "source": [ + "#Probamos el random forest" + ] + }, + { + "cell_type": "code", + "execution_count": 199, + "id": "09f429a5", + "metadata": {}, + "outputs": [], + "source": [ + "X = df.drop(\"consume\", axis=1)\n", + "y = df.consume\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)" + ] + }, + { + "cell_type": "code", + "execution_count": 200, + "id": "d9ddef9b", + "metadata": {}, + "outputs": [], + "source": [ + "forest = RandomForestRegressor()" + ] + }, + { + "cell_type": "code", + 
"execution_count": 201, + "id": "e44a95a9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "RandomForestRegressor()" + ] + }, + "execution_count": 201, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "forest.fit(X_train,y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 202, + "id": "ca9015d1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.24289915602982237" + ] + }, + "execution_count": 202, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mean_squared_error(y_train,forest.predict(X_train)).round(3)**0.5" + ] + }, + { + "cell_type": "code", + "execution_count": 203, + "id": "84b57363", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.6985699678629192" + ] + }, + "execution_count": 203, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mean_squared_error(y_test,forest.predict(X_test)).round(3)**0.5" + ] + }, + { + "cell_type": "markdown", + "id": "793a694f", + "metadata": {}, + "source": [ + "#### Obtenemos un MSE más bajo en el random forest, 0.24 en el test, y de 0.69 en el train, aún así no parece un gran modelo" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2583fd64", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be457321", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/MainCobify.ipynb b/MainCobify.ipynb new file mode 100644 index 0000000..69d9ab2 --- /dev/null 
+++ b/MainCobify.ipynb @@ -0,0 +1,992 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "25980055", + "metadata": {}, + "source": [ + "### Importamos librerías" + ] + }, + { + "cell_type": "code", + "execution_count": 134, + "id": "f63a2ff3", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "from sklearn import metrics\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.model_selection import train_test_split\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")\n", + "from sklearn.svm import SVC\n", + "from sklearn.metrics import mean_squared_error, r2_score\n", + "from sklearn.ensemble import RandomForestRegressor\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.ensemble import GradientBoostingRegressor\n", + "\n", + "\n", + "from matplotlib import pyplot as plt\n", + "%matplotlib inline\n", + "%config Inlinebackend.figure_format = 'retina'\n", + "\n", + "import seaborn as sns\n", + "sns.set_context('poster')\n", + "sns.set(rc={'figure.figsize': (16., 9.)})\n", + "sns.set_style('whitegrid')\n", + "\n", + "import numpy as np\n" + ] + }, + { + "cell_type": "code", + "execution_count": 118, + "id": "e6e0415c", + "metadata": {}, + "outputs": [], + "source": [ + "# Nos traemos el data set, teniendo en cuenta que los decimales están separados por coma\n", + "df = pd.read_csv('measurements.csv',decimal=',')\n" + ] + }, + { + "cell_type": "markdown", + "id": "2ce8eeeb", + "metadata": {}, + "source": [ + "### Análisis y limpieza de datos" + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "id": "f3c01757", + "metadata": {}, + "outputs": [], + "source": [ + "#df2 = pd.readcsv(\"measurements2.xlsx\",encoding = \"ISO-8859-1\")" + ] + }, + { + "cell_type": "code", + "execution_count": 120, + "id": "6275ac7c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(388, 12)" + ] + }, + "execution_count": 120, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "id": "eaf5b089", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
distanceconsumespeedtemp_insidetemp_outsidespecialsgas_typeACrainsunrefill litersrefill gas
028.05.02621.512NaNE1000045.0E10
112.04.23021.513NaNE10000NaNNaN
211.25.53821.515NaNE10000NaNNaN
\n", + "
" + ], + "text/plain": [ + " distance consume speed temp_inside temp_outside specials gas_type AC \\\n", + "0 28.0 5.0 26 21.5 12 NaN E10 0 \n", + "1 12.0 4.2 30 21.5 13 NaN E10 0 \n", + "2 11.2 5.5 38 21.5 15 NaN E10 0 \n", + "\n", + " rain sun refill liters refill gas \n", + "0 0 0 45.0 E10 \n", + "1 0 0 NaN NaN \n", + "2 0 0 NaN NaN " + ] + }, + "execution_count": 121, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "id": "43d9089e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "distance 0\n", + "consume 0\n", + "speed 0\n", + "temp_inside 12\n", + "temp_outside 0\n", + "specials 295\n", + "gas_type 0\n", + "AC 0\n", + "rain 0\n", + "sun 0\n", + "refill liters 375\n", + "refill gas 375\n", + "dtype: int64" + ] + }, + "execution_count": 122, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 123, + "id": "db8bcbb5", + "metadata": {}, + "outputs": [], + "source": [ + "#Eliminamos la columna refill gas, ya que es la misma que gas_type, aunque en algunas falten datos se observa como cambia a la par\n", + "#Elimanamos tambien la columna specials, porque nos da la misma información que las columna AC, rain y sun\n", + "df.drop(['refill gas', 'specials'], axis = 1 , inplace = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 124, + "id": "9464bdde", + "metadata": {}, + "outputs": [], + "source": [ + "#Para la columna refill liters le asignamos en los valores que faltan el valor que coincide con el gas_type" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "id": "f0459a09", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "distance 0\n", + "consume 0\n", + "speed 0\n", + "temp_inside 12\n", + "temp_outside 0\n", + "gas_type 0\n", + "AC 0\n", + "rain 0\n", + "sun 0\n", + "refill liters 375\n", + 
"dtype: int64" + ] + }, + "execution_count": 125, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 126, + "id": "04f84b0a", + "metadata": {}, + "outputs": [], + "source": [ + "#Para los 12 datos de temperatura interior que nos faltan al ser un 3% de la muestra le asignaremos el valor medio" + ] + }, + { + "cell_type": "code", + "execution_count": 127, + "id": "a5ecb335", + "metadata": {}, + "outputs": [], + "source": [ + "df['temp_inside'] = df['temp_inside'].fillna(22)" + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "id": "f24f2c8b", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
distanceconsumespeedtemp_insidetemp_outsidegas_typeACrainsunrefill liters
028.05.02621.512E1000045.0
112.04.23021.513E1000045.0
211.25.53821.515E1000045.0
\n", + "
" + ], + "text/plain": [ + " distance consume speed temp_inside temp_outside gas_type AC rain \\\n", + "0 28.0 5.0 26 21.5 12 E10 0 0 \n", + "1 12.0 4.2 30 21.5 13 E10 0 0 \n", + "2 11.2 5.5 38 21.5 15 E10 0 0 \n", + "\n", + " sun refill liters \n", + "0 0 45.0 \n", + "1 0 45.0 \n", + "2 0 45.0 " + ] + }, + "execution_count": 128, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Para la columna refill liters, completamos el valor con el valor previo, ya que también parece ir correlacionada con elgas type\n", + "df = df.fillna(method='ffill')\n", + "df.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 129, + "id": "df948205", + "metadata": {}, + "outputs": [], + "source": [ + "#Normalizamos los dos tipos de combustible para facilitar en análisis de su correlación\n", + "\n", + "combus= {\"E10\": 1,\n", + " \"SP98\": 0,\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": 130, + "id": "df8fdbdd", + "metadata": {}, + "outputs": [], + "source": [ + "df.gas_type = df.gas_type.map(combus)" + ] + }, + { + "cell_type": "markdown", + "id": "e30f7ca3", + "metadata": {}, + "source": [ + "### Correlación" + ] + }, + { + "cell_type": "code", + "execution_count": 131, + "id": "31628178", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
distanceconsumespeedtemp_insidetemp_outsidegas_typeACrainsunrefill liters
distance1.000000-0.1289670.5622990.0747340.0881750.053411-0.025738-0.0197910.081120-0.097117
consume-0.1289671.000000-0.227866-0.161470-0.3208110.0153270.0965910.248118-0.1706670.139765
speed0.562299-0.2278661.0000000.0591390.0154110.097360-0.0354080.0094890.081618-0.086277
temp_inside0.074734-0.1614700.0591391.0000000.359649-0.0109210.296719-0.0372040.242237-0.077320
temp_outside0.088175-0.3208110.0154110.3596491.000000-0.1487050.167562-0.1863150.346903-0.356930
gas_type0.0534110.0153270.097360-0.010921-0.1487051.000000-0.105285-0.060328-0.0227610.133453
AC-0.0257380.096591-0.0354080.2967190.167562-0.1052851.0000000.2429150.088598-0.079404
rain-0.0197910.2481180.009489-0.037204-0.186315-0.0603280.2429151.000000-0.112650-0.083591
sun0.081120-0.1706670.0816180.2422370.346903-0.0227610.088598-0.1126501.000000-0.055477
refill liters-0.0971170.139765-0.086277-0.077320-0.3569300.133453-0.079404-0.083591-0.0554771.000000
\n", + "
" + ], + "text/plain": [ + " distance consume speed temp_inside temp_outside \\\n", + "distance 1.000000 -0.128967 0.562299 0.074734 0.088175 \n", + "consume -0.128967 1.000000 -0.227866 -0.161470 -0.320811 \n", + "speed 0.562299 -0.227866 1.000000 0.059139 0.015411 \n", + "temp_inside 0.074734 -0.161470 0.059139 1.000000 0.359649 \n", + "temp_outside 0.088175 -0.320811 0.015411 0.359649 1.000000 \n", + "gas_type 0.053411 0.015327 0.097360 -0.010921 -0.148705 \n", + "AC -0.025738 0.096591 -0.035408 0.296719 0.167562 \n", + "rain -0.019791 0.248118 0.009489 -0.037204 -0.186315 \n", + "sun 0.081120 -0.170667 0.081618 0.242237 0.346903 \n", + "refill liters -0.097117 0.139765 -0.086277 -0.077320 -0.356930 \n", + "\n", + " gas_type AC rain sun refill liters \n", + "distance 0.053411 -0.025738 -0.019791 0.081120 -0.097117 \n", + "consume 0.015327 0.096591 0.248118 -0.170667 0.139765 \n", + "speed 0.097360 -0.035408 0.009489 0.081618 -0.086277 \n", + "temp_inside -0.010921 0.296719 -0.037204 0.242237 -0.077320 \n", + "temp_outside -0.148705 0.167562 -0.186315 0.346903 -0.356930 \n", + "gas_type 1.000000 -0.105285 -0.060328 -0.022761 0.133453 \n", + "AC -0.105285 1.000000 0.242915 0.088598 -0.079404 \n", + "rain -0.060328 0.242915 1.000000 -0.112650 -0.083591 \n", + "sun -0.022761 0.088598 -0.112650 1.000000 -0.055477 \n", + "refill liters 0.133453 -0.079404 -0.083591 -0.055477 1.000000 " + ] + }, + "execution_count": 131, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#observamos la colerracion\n", + "corre = df.corr()\n", + "corre" + ] + }, + { + "cell_type": "code", + "execution_count": 132, + "id": "f3a239ad", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":1: DeprecationWarning: `np.bool` is a deprecated alias for the builtin `bool`. To silence this warning, use `bool` by itself. Doing this will not modify any behavior and is safe. 
If you specifically wanted the numpy scalar type, use `np.bool_` here.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " mask = np.triu(np.ones_like(corre, dtype=np.bool))\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "mask = np.triu(np.ones_like(corre, dtype=np.bool))\n", + "heatmap = sns.heatmap(corre, mask=mask, vmin=-1, vmax=1, annot=True, cmap='BrBG');" + ] + }, + { + "cell_type": "markdown", + "id": "118ff471", + "metadata": {}, + "source": [ + "#### Observamos que el consumo se relaciona principal y negativamente con la velocidad y la temperatura exterior, sin ser ninguna superior al 0.5 (valor absoluto). Además, la velocidad se correlaciona positivamente con la distancia (trayectos largos a mayor velocidad). La temperatura exterior se correlaciona positivamente con el sol y la temperatura interior, y negativamente con los litros de relleno. No existen otras correlaciones superiores a 0.35\n" + ] + }, + { + "cell_type": "markdown", + "id": "d72e3839", + "metadata": {}, + "source": [ + "### Predicciones consumo" + ] + }, + { + "cell_type": "code", + "execution_count": 164, + "id": "1f31b290", + "metadata": {}, + "outputs": [], + "source": [ + "#Probamos la regresión lineal" + ] + }, + { + "cell_type": "code", + "execution_count": 191, + "id": "305a98fa", + "metadata": {}, + "outputs": [], + "source": [ + "X = df.drop(\"consume\", axis=1)\n", + "y = df.consume\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)" + ] + }, + { + "cell_type": "code", + "execution_count": 192, + "id": "5cbefd90", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LinearRegression()" + ] + }, + "execution_count": 192, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lr = LinearRegression()\n", + "lr.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 193, + "id": "4451dfa4", + "metadata": {}, + "outputs": [], + "source": [ + "y_pred_train = lr.predict(X_train)\n", + "y_pred_test = lr.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 194, + "id": "8bbff554", + 
"metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.22815357510490153" + ] + }, + "execution_count": 194, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_train_predict = lr.predict(X_train)\n", + "metrics.r2_score(y_train, y_train_predict)" + ] + }, + { + "cell_type": "code", + "execution_count": 195, + "id": "d2968680", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.1020698110029008" + ] + }, + "execution_count": 195, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_test_predict = lr.predict(X_test)\n", + "metrics.r2_score(y_test, y_test_predict)" + ] + }, + { + "cell_type": "code", + "execution_count": 196, + "id": "0c83c6cb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9154795542880089" + ] + }, + "execution_count": 196, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mean_squared_error(y_train, y_train_predict)**0.5" + ] + }, + { + "cell_type": "code", + "execution_count": 197, + "id": "1c1920a6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9375077349245255" + ] + }, + "execution_count": 197, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "metrics.mean_squared_error(y_test, y_test_predict)**0.5" + ] + }, + { + "cell_type": "code", + "execution_count": 198, + "id": "2e93fef8", + "metadata": {}, + "outputs": [], + "source": [ + "#Probamos el random forest" + ] + }, + { + "cell_type": "code", + "execution_count": 199, + "id": "09f429a5", + "metadata": {}, + "outputs": [], + "source": [ + "X = df.drop(\"consume\", axis=1)\n", + "y = df.consume\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)" + ] + }, + { + "cell_type": "code", + "execution_count": 200, + "id": "d9ddef9b", + "metadata": {}, + "outputs": [], + "source": [ + "forest = RandomForestRegressor()" + ] + }, + { + "cell_type": "code", + 
"execution_count": 201, + "id": "e44a95a9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "RandomForestRegressor()" + ] + }, + "execution_count": 201, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "forest.fit(X_train,y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 202, + "id": "ca9015d1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.24289915602982237" + ] + }, + "execution_count": 202, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mean_squared_error(y_train,forest.predict(X_train)).round(3)**0.5" + ] + }, + { + "cell_type": "code", + "execution_count": 203, + "id": "84b57363", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.6985699678629192" + ] + }, + "execution_count": 203, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mean_squared_error(y_test,forest.predict(X_test)).round(3)**0.5" + ] + }, + { + "cell_type": "markdown", + "id": "793a694f", + "metadata": {}, + "source": [ + "#### Obtenemos un RMSE más bajo en el random forest que en la regresión lineal: 0.24 en el train y 0.70 en el test (frente a 0.92 y 0.94). La gran diferencia entre train y test apunta a sobreajuste, por lo que aun así no parece un gran modelo" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2583fd64", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be457321", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}