VictorRodriguezIronhack · Pomilusky · Mar 17, 2022
diff --git a/Code/0.Exploration.ipynb b/Code/0.Exploration.ipynb
diff --git a/Code/1.Transformation.ipynb b/Code/1.Transformation.ipynb
@@ -0,0 +1,343 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 104,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# All the libraries we are using:\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import requests\n",
+    "from bs4 import BeautifulSoup\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 105,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# let's load the csv file into a dataframe\n",
+    "df = pd.read_csv('../Data/cleaned_mes.csv', index_col='Unnamed: 0') # index_col would not be needed here if the file had been saved with to_csv(..., index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 106,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>distance</th>\n",
+       "      <th>consume</th>\n",
+       "      <th>speed</th>\n",
+       "      <th>temp_inside</th>\n",
+       "      <th>temp_outside</th>\n",
+       "      <th>gas_type</th>\n",
+       "      <th>AC</th>\n",
+       "      <th>rain</th>\n",
+       "      <th>sun</th>\n",
+       "      <th>snow</th>\n",
+       "      <th>temp_gradient</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>313</th>\n",
+       "      <td>11.3</td>\n",
+       "      <td>4.3</td>\n",
+       "      <td>38</td>\n",
+       "      <td>22.0</td>\n",
+       "      <td>17</td>\n",
+       "      <td>SP98</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>-5.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>339</th>\n",
+       "      <td>15.4</td>\n",
+       "      <td>4.1</td>\n",
+       "      <td>45</td>\n",
+       "      <td>22.0</td>\n",
+       "      <td>24</td>\n",
+       "      <td>E10</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>2.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>298</th>\n",
+       "      <td>16.3</td>\n",
+       "      <td>4.5</td>\n",
+       "      <td>58</td>\n",
+       "      <td>22.0</td>\n",
+       "      <td>16</td>\n",
+       "      <td>SP98</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>-6.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>70</th>\n",
+       "      <td>12.3</td>\n",
+       "      <td>5.2</td>\n",
+       "      <td>55</td>\n",
+       "      <td>21.5</td>\n",
+       "      <td>12</td>\n",
+       "      <td>SP98</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>-9.5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>279</th>\n",
+       "      <td>24.7</td>\n",
+       "      <td>4.5</td>\n",
+       "      <td>26</td>\n",
+       "      <td>22.0</td>\n",
+       "      <td>10</td>\n",
+       "      <td>SP98</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>-12.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>100</th>\n",
+       "      <td>5.3</td>\n",
+       "      <td>4.1</td>\n",
+       "      <td>34</td>\n",
+       "      <td>21.5</td>\n",
+       "      <td>9</td>\n",
+       "      <td>SP98</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>-12.5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>187</th>\n",
+       "      <td>12.3</td>\n",
+       "      <td>4.8</td>\n",
+       "      <td>41</td>\n",
+       "      <td>22.5</td>\n",
+       "      <td>7</td>\n",
+       "      <td>E10</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>-15.5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>179</th>\n",
+       "      <td>16.2</td>\n",
+       "      <td>5.2</td>\n",
+       "      <td>29</td>\n",
+       "      <td>21.0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>E10</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>-21.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>90</th>\n",
+       "      <td>11.8</td>\n",
+       "      <td>4.3</td>\n",
+       "      <td>37</td>\n",
+       "      <td>20.0</td>\n",
+       "      <td>6</td>\n",
+       "      <td>SP98</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>-14.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>309</th>\n",
+       "      <td>31.9</td>\n",
+       "      <td>4.3</td>\n",
+       "      <td>33</td>\n",
+       "      <td>22.0</td>\n",
+       "      <td>16</td>\n",
+       "      <td>SP98</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>-6.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "     distance  consume  speed  temp_inside  temp_outside gas_type     AC  \\\n",
+       "313      11.3      4.3     38         22.0            17     SP98  False   \n",
+       "339      15.4      4.1     45         22.0            24      E10  False   \n",
+       "298      16.3      4.5     58         22.0            16     SP98  False   \n",
+       "70       12.3      5.2     55         21.5            12     SP98  False   \n",
+       "279      24.7      4.5     26         22.0            10     SP98  False   \n",
+       "100       5.3      4.1     34         21.5             9     SP98  False   \n",
+       "187      12.3      4.8     41         22.5             7      E10  False   \n",
+       "179      16.2      5.2     29         21.0             0      E10  False   \n",
+       "90       11.8      4.3     37         20.0             6     SP98  False   \n",
+       "309      31.9      4.3     33         22.0            16     SP98  False   \n",
+       "\n",
+       "      rain    sun   snow  temp_gradient  \n",
+       "313  False  False  False           -5.0  \n",
+       "339  False  False  False            2.0  \n",
+       "298  False  False  False           -6.0  \n",
+       "70   False  False  False           -9.5  \n",
+       "279  False  False  False          -12.0  \n",
+       "100  False  False  False          -12.5  \n",
+       "187  False  False  False          -15.5  \n",
+       "179  False  False  False          -21.0  \n",
+       "90   False  False  False          -14.0  \n",
+       "309  False  False  False           -6.0  "
+      ]
+     },
+     "execution_count": 106,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# As we are focusing on the price, I am going to create 2 new columns: one with the consumption per unit of distance and the other with the price per unit of distance.\n",
+    "# I don't have any data on the dates these services were delivered, so I cannot infer how much it actually cost to deliver them; instead,\n",
+    "# I am going to use the prices of these gas types today (17/03/2022): SP98 1.955€/l, E10 1.825€/l\n",
+    "# Source: https://www.dieselogasolina.com/\n",
+    "df.sample(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 107,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# I have also created a little function to get the prices in real time, which might be useful for future uses of this code.\n",
+    "def find_price():\n",
+    "    page = requests.get('https://www.dieselogasolina.com/')\n",
+    "    soup = BeautifulSoup(page.content, 'html.parser')\n",
+    "    table = soup.find('table').find_all('tr')\n",
+    "    p_E10 = table[1].find_all('td')[1].text\n",
+    "    p_SP98 = table[2].find_all('td')[1].text\n",
+    "    return float(p_SP98[:5].replace(',','.')), float(p_E10[:5].replace(',','.'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 108,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Let's start by creating a function to transform the dataframe:\n",
+    "def df_trans(dis, con, gt, prices = find_price()): # dis, con and gt are the distance, consume and gas_type of one row; price data from https://www.dieselogasolina.com/\n",
+    "    \"\"\" This function transforms the dataframe by creating 2 new columns: the consume per unit of distance and the price per unit\n",
+    "    of distance. It would also be easy to include a column with the total price (distance * price/distance). It should be used as:\n",
+    "                                df[new_columns] = df.apply(lambda x: df_trans(x.distance, x.consume, x.gas_type, prices=[...]), axis=1)\n",
+    "    The optional argument prices holds the prices of the gas types we are using: [price_SP98, price_E10]\"\"\"\n",
+    "    #print(gt)\n",
+    "    cpd = con/dis # The consume per distance is the first thing we can calculate as it doesn't require any transformation\n",
+    "\n",
+    "\n",
+    "    price = dict({'SP98':prices[0], 'E10':prices[1]})\n",
+    "\n",
+    "    ppd = price[gt]*cpd # We can already know the price per distance\n",
+    "\n",
+    "    return pd.Series({'con_dis':cpd, 'price_dis':ppd})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 109,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fp = find_price()\n",
+    "df['gas_price'] = df.gas_type.apply(lambda x: fp[0] if x == 'SP98' else fp[1])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 110,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df[['con_dis', 'price_dis']] = df.apply(lambda x:df_trans(x.distance,x.consume,x.gas_type), axis=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 111,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Now that we have transformed the DataFrame, let's save it and work with it in another notebook (note that this overwrites the input file loaded at the top).\n",
+    "df.to_csv('../Data/cleaned_mes.csv', index=False)"
+   ]
+  }
+ ],
+ "metadata": {
+  "interpreter": {
+   "hash": "4034f9195f5552b4454ef60198efa491d941068725cfe9b8182a5b0158f58c43"
+  },
+  "kernelspec": {
+   "display_name": "Python 3.8.12 ('ironhack')",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.12"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}