import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from keras import Input
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.layers.core import Dense, Activation, Dropout
LSTM multivariate prediction
Import python packages:
= pd.read_csv('data/clean/final_covid_data.csv')
data_covid data_covid
provincia | fecha | num_casos | num_casos_prueba_pcr | num_casos_prueba_test_ac | num_casos_prueba_ag | num_casos_prueba_elisa | num_casos_prueba_desconocida | num_hosp | num_uci | ... | ws | ws_max | sol | mob_grocery_pharmacy | mob_parks | mob_residential | mob_retail_recreation | mob_transit_stations | mob_workplaces | mob_flujo | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Barcelona | 2020-01-01 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 2.5 | 7.2 | 4.9 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | Madrid | 2020-01-01 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0.8 | 3.6 | 8.3 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | Málaga | 2020-01-01 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 3.3 | 6.7 | 7.7 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | Asturias | 2020-01-01 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 2.5 | 7.8 | 7.9 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | Sevilla | 2020-01-01 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 1.9 | 5.8 | 9.1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
4090 | Barcelona | 2022-03-29 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 7.2 | 13.3 | 0.0 | 0.0 | -13.0 | 5.0 | -24.0 | -16.0 | -17.0 | NaN |
4091 | Madrid | 2022-03-29 | 6 | 3 | 0 | 3 | 0 | 0 | 0 | 0 | ... | 2.2 | 6.1 | 2.4 | 1.0 | -11.0 | 4.0 | -25.0 | -16.0 | -16.0 | NaN |
4092 | Málaga | 2022-03-29 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 4.2 | 10.8 | 1.5 | 4.0 | -3.0 | 4.0 | -16.0 | 1.0 | -8.0 | NaN |
4093 | Asturias | 2022-03-29 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 2.2 | 6.7 | 0.0 | -4.0 | 17.0 | 3.0 | -25.0 | -15.0 | -12.0 | NaN |
4094 | Sevilla | 2022-03-29 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 2.5 | 8.3 | 1.6 | 3.0 | -7.0 | 2.0 | -15.0 | -13.0 | -5.0 | NaN |
4095 rows × 26 columns
All the available data
Asturias
= data_covid.loc[data_covid['provincia'] == 'Asturias']
data_asturias = data_asturias.set_index('fecha')
data_asturias = data_asturias['2020-06-14':]
data_asturias = data_asturias.filter(['num_casos', 'tmed', 'mob_grocery_pharmacy',
data_asturias 'mob_parks', 'mob_residential', 'mob_residential', 'mob_transit_stations', 'mob_workplaces'])
data_asturias
num_casos | tmed | mob_grocery_pharmacy | mob_parks | mob_residential | mob_residential | mob_transit_stations | mob_workplaces | |
---|---|---|---|---|---|---|---|---|
fecha | ||||||||
2020-06-14 | 0 | 16.8 | -8.0 | 6.0 | 4.0 | 4.0 | -43.0 | -2.0 |
2020-06-15 | 0 | 16.0 | -6.0 | 39.0 | 7.0 | 7.0 | -38.0 | -27.0 |
2020-06-16 | 1 | 15.6 | -6.0 | 7.0 | 9.0 | 9.0 | -43.0 | -29.0 |
2020-06-17 | 0 | 15.6 | -7.0 | 20.0 | 8.0 | 8.0 | -42.0 | -28.0 |
2020-06-18 | 0 | 15.1 | -4.0 | 68.0 | 7.0 | 7.0 | -39.0 | -28.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
2022-03-25 | 244 | 11.6 | -2.0 | 24.0 | 2.0 | 2.0 | -13.0 | -12.0 |
2022-03-26 | 432 | 12.0 | -13.0 | 27.0 | 1.0 | 1.0 | -19.0 | -13.0 |
2022-03-27 | 1 | 14.2 | -7.5 | 33.0 | -1.0 | -1.0 | -13.0 | -11.0 |
2022-03-28 | 9 | 13.8 | -2.0 | 45.0 | 1.0 | 1.0 | -9.0 | -10.0 |
2022-03-29 | 0 | 11.7 | -4.0 | 17.0 | 3.0 | 3.0 | -15.0 | -12.0 |
654 rows × 8 columns
data_asturias.describe()
num_casos | tmed | mob_grocery_pharmacy | mob_parks | mob_residential | mob_residential | mob_transit_stations | mob_workplaces | |
---|---|---|---|---|---|---|---|---|
count | 654.00000 | 654.000000 | 654.00000 | 654.000000 | 654.000000 | 654.000000 | 654.000000 | 654.000000 |
mean | 312.11315 | 13.909098 | 3.83792 | 57.725535 | 3.695719 | 3.695719 | -12.614679 | -20.651376 |
std | 565.17985 | 4.198047 | 15.05797 | 68.024228 | 4.802763 | 4.802763 | 14.838430 | 12.754756 |
min | 0.00000 | 3.300000 | -87.00000 | -63.000000 | -8.000000 | -8.000000 | -67.000000 | -80.000000 |
25% | 43.50000 | 10.800000 | -0.50000 | 13.000000 | 1.000000 | 1.000000 | -22.000000 | -26.000000 |
50% | 117.00000 | 13.900000 | 5.00000 | 39.000000 | 3.000000 | 3.000000 | -13.000000 | -18.500000 |
75% | 323.75000 | 17.600000 | 10.00000 | 83.500000 | 6.000000 | 6.000000 | -4.000000 | -14.000000 |
max | 3827.00000 | 25.100000 | 42.00000 | 333.000000 | 26.000000 | 26.000000 | 26.000000 | 12.000000 |
= data_asturias.values np_data_asturias
# Train dataset
= MinMaxScaler(feature_range=(0, 1))
scaler = scaler.fit_transform(np_data_asturias)
scaled_data_asturias print(f'Longitud del conjunto de datos disponible: {len(scaled_data_asturias)}')
Longitud del conjunto de datos disponible: 654
# Since we are going to predict future values based on the 90 past elements,
# we need to create a list with those historic information for each element
= 90
historic_values = []
scaled_data_asturias_x = []
scaled_data_asturias_y
for i in range(historic_values, len(scaled_data_asturias)):
-historic_values):i, :])
scaled_data_asturias_x.append(scaled_data_asturias[(i0])
scaled_data_asturias_y.append(scaled_data_asturias[i,
# Convert the x_train and y_train to numpy arrays
= np.array(scaled_data_asturias_x)
scaled_data_asturias_x = np.array(scaled_data_asturias_y) scaled_data_asturias_y
# Once predicted, we are going to need a exclusive scaler for num_cases
= MinMaxScaler(feature_range=(0, 1))
scaler_pred = pd.DataFrame(data_asturias['num_casos'])
df_cases_asturias = scaler_pred.fit_transform(df_cases_asturias) scaled_data_asturias_pred
# Since the first 90th values does not have historic, the dataset has been reduced in 90 values
print(f'Longitud datos de entrenamiento con historico: {len(scaled_data_asturias_y)}')
Longitud datos de entrenamiento con historico: 564
# we split data in train and test
# as in previous analysis, we are going to predict a maximum of 90 days
= scaled_data_asturias_x[0:len(scaled_data_asturias_x)-91]
x_train = scaled_data_asturias_y[0:len(scaled_data_asturias_y)-91]
y_train print(f'Cantidad datos de entrenamiento: x={len(x_train)} - y={len(y_train)}')
= scaled_data_asturias_x[len(scaled_data_asturias_x)-90:len(scaled_data_asturias_x)]
x_test = scaled_data_asturias_y[len(scaled_data_asturias_y)-90:len(scaled_data_asturias_y)]
y_test print(f'Cantidad datos de test: x={len(x_test)} - y={len(y_test)}')
Cantidad datos de entrenamiento: x=473 - y=473
Cantidad datos de test: x=90 - y=90
# Reshape the data to feed de recurrent network
= np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 8))
x_train print("Train data shape:")
print(x_train.shape)
print(y_train.shape)
= np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 8))
x_test print("Test data shape:")
print(x_test.shape)
print(y_test.shape)
Train data shape:
(473, 90, 8)
(473,)
Test data shape:
(90, 90, 8)
(90,)
# Configure / setup the neural network model - LSTM
# Build the model
print('Build model...')
= Sequential()
model # Model with Neurons
# Inputshape = neurons -> Timestamps
= x_train.shape[1]
neurons90,
model.add(LSTM(= 'relu',
activation = True,
return_sequences = (x_train.shape[1], 8)))
input_shape 50,
model.add(LSTM(= 'relu',
activation = True))
return_sequences 25,
model.add(LSTM(= 'relu',
activation = False))
return_sequences 5, activation = 'relu'))
model.add(Dense(1)) model.add(Dense(
Build model...
2022-05-25 01:28:48.052818: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
compile(optimizer='adam', loss='mean_squared_error')
model. model.summary()
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
lstm (LSTM) (None, 90, 90) 35640
lstm_1 (LSTM) (None, 90, 50) 28200
lstm_2 (LSTM) (None, 25) 7600
dense (Dense) (None, 5) 130
dense_1 (Dense) (None, 1) 6
=================================================================
Total params: 71,576
Trainable params: 71,576
Non-trainable params: 0
_________________________________________________________________
# Training the model
# fit network
= model.fit(x_train,
history
y_train, =1000,
batch_size=30,
epochs= (x_test, y_test),
validation_data = 0) verbose
'loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.plot(history.history[
plt.legend() plt.show()
# Get the predicted values
= model.predict(x_test)
predictions = scaler_pred.inverse_transform(predictions) predictions
1/3 [=========>....................] - ETA: 0s
3/3 [==============================] - ETA: 0s
3/3 [==============================] - 0s 30ms/step
= y_test.reshape(-1,1)
y_test = scaler_pred.inverse_transform(y_test) y_test
# Calculate the mean absolute error (MAE)
= mean_absolute_error(y_test, predictions)
mae print('MAE: ' + str(round(mae, 1)))
# Calculate the root mean squarred error (RMSE)
= np.sqrt(mean_squared_error(y_test,predictions))
rmse print('RMSE: ' + str(round(rmse, 1)))
# Calculate the root mean squarred error (RMSE)
= mean_squared_error(y_test,
rmse
predictions,= False)
squared print('RMSE: ' + str(round(rmse, 1)))
MAE: 738.5
RMSE: 1169.8
RMSE: 1169.8
# Add the difference between the valid and predicted prices
= data_asturias[:(len(x_train)+92)]
train = data_asturias[(len(x_train)+91):] valid
1, "Predictions", predictions, True)
valid.insert(1, "Difference", valid["Predictions"] - valid["num_casos"], True) valid.insert(
# Zoom-in to a closer timeframe
# Date from which on the date is displayed
= "2021-10-31"
display_start_date = valid[valid.index > display_start_date]
valid = train[train.index > display_start_date] train
# Visualize the data
'ggplot')
matplotlib.style.use(= plt.subplots(figsize=(22, 10), sharex=True)
fig, ax1
# Data - Train
= train.index;
xt = train[["num_casos"]]
yt # Data - Test / validation
= valid.index;
xv = valid[["num_casos", "Predictions"]]
yv
# Plot
"Asturias: Predictions vs Real infections", fontsize=20)
plt.title("Nº Cases", fontsize=18)
plt.ylabel(
="blue", linewidth=1.5)
plt.plot(yt, color"Predictions"], color="red", linewidth=1.5)
plt.plot(yv["num_casos"], color="green", linewidth=1.5)
plt.plot(yv["Train", "LSTM Predictions", "Test"],
plt.legend([="upper left", fontsize=18)
loc
# Bar plot with the differences
= valid.index
x = valid["Difference"]
y =0.2, color="grey")
plt.bar(x, y, width
plt.grid() plt.show()
Barcelona
= data_covid.loc[data_covid['provincia'] == 'Barcelona']
data_barcelona = data_barcelona.set_index('fecha')
data_barcelona = data_barcelona['2020-06-14':]
data_barcelona = data_barcelona.filter(['num_casos', 'tmed', 'mob_grocery_pharmacy',
data_barcelona 'mob_parks', 'mob_residential', 'mob_residential', 'mob_transit_stations', 'mob_workplaces'])
data_barcelona
num_casos | tmed | mob_grocery_pharmacy | mob_parks | mob_residential | mob_residential | mob_transit_stations | mob_workplaces | |
---|---|---|---|---|---|---|---|---|
fecha | ||||||||
2020-06-14 | 33 | 20.6 | -20.0 | 15.0 | 6.0 | 6.0 | -31.0 | -6.0 |
2020-06-15 | 62 | 21.2 | -9.0 | 17.0 | 14.0 | 14.0 | -34.0 | -41.0 |
2020-06-16 | 66 | 20.8 | -10.0 | -16.0 | 16.0 | 16.0 | -39.0 | -41.0 |
2020-06-17 | 70 | 20.3 | -4.0 | 14.0 | 15.0 | 15.0 | -33.0 | -40.0 |
2020-06-18 | 68 | 18.8 | -8.0 | -6.0 | 16.0 | 16.0 | -38.0 | -41.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
2022-03-25 | 598 | 14.4 | 3.0 | -11.0 | 4.0 | 4.0 | -18.0 | -16.0 |
2022-03-26 | 345 | 12.8 | -11.0 | -33.0 | 3.0 | 3.0 | -22.0 | -12.0 |
2022-03-27 | 252 | 15.6 | -10.0 | -3.0 | 0.0 | 0.0 | -12.0 | -10.0 |
2022-03-28 | 688 | 14.0 | -1.0 | -2.0 | 4.0 | 4.0 | -15.0 | -15.0 |
2022-03-29 | 0 | 13.0 | 0.0 | -13.0 | 5.0 | 5.0 | -16.0 | -17.0 |
654 rows × 8 columns
data_barcelona.describe()
num_casos | tmed | mob_grocery_pharmacy | mob_parks | mob_residential | mob_residential | mob_transit_stations | mob_workplaces | |
---|---|---|---|---|---|---|---|---|
count | 654.000000 | 654.000000 | 654.000000 | 654.000000 | 654.000000 | 654.000000 | 654.000000 | 654.000000 |
mean | 2605.048930 | 17.294801 | 0.180428 | -5.547401 | 7.178899 | 7.178899 | -24.041284 | -27.391437 |
std | 4823.001644 | 6.163636 | 16.568328 | 15.365483 | 5.019727 | 5.019727 | 11.241460 | 15.949374 |
min | 0.000000 | 4.300000 | -83.000000 | -76.000000 | -6.000000 | -6.000000 | -72.000000 | -87.000000 |
25% | 597.250000 | 12.200000 | -7.000000 | -15.000000 | 4.000000 | 4.000000 | -32.000000 | -34.000000 |
50% | 1030.500000 | 16.300000 | 2.000000 | -5.000000 | 7.000000 | 7.000000 | -24.000000 | -25.000000 |
75% | 2243.750000 | 23.600000 | 10.000000 | 6.000000 | 10.000000 | 10.000000 | -15.000000 | -18.000000 |
max | 34701.000000 | 28.400000 | 58.000000 | 33.000000 | 32.000000 | 32.000000 | -2.000000 | 6.000000 |
= data_barcelona.values np_data_barcelona
# Train dataset
= MinMaxScaler(feature_range=(0, 1))
scaler = scaler.fit_transform(np_data_barcelona)
scaled_data_barcelona print(f'Longitud del conjunto de datos disponible: {len(scaled_data_barcelona)}')
Longitud del conjunto de datos disponible: 654
# Since we are going to predict future values based on the 90 past elements,
# we need to create a list with those historic information for each element
= 90
historic_values = []
scaled_data_barcelona_x = []
scaled_data_barcelona_y
for i in range(historic_values, len(scaled_data_barcelona)):
-historic_values):i, :])
scaled_data_barcelona_x.append(scaled_data_barcelona[(i0])
scaled_data_barcelona_y.append(scaled_data_barcelona[i,
# Convert the x_train and y_train to numpy arrays
= np.array(scaled_data_barcelona_x)
scaled_data_barcelona_x = np.array(scaled_data_barcelona_y) scaled_data_barcelona_y
# Once predicted, we are going to need a exclusive scaler for num_cases
= MinMaxScaler(feature_range=(0, 1))
scaler_pred = pd.DataFrame(data_barcelona['num_casos'])
df_cases_barcelona = scaler_pred.fit_transform(df_cases_barcelona) scaled_data_barcelona_pred
# Since the first 90th values does not have historic, the dataset has been reduced in 90 values
print(f'Longitud datos de entrenamiento con historico: {len(scaled_data_barcelona_y)}')
Longitud datos de entrenamiento con historico: 564
# we split data in train and test
# as in previous analysis, we are going to predict a maximum of 90 days
= scaled_data_barcelona_x[0:len(scaled_data_barcelona_x)-91]
x_train = scaled_data_barcelona_y[0:len(scaled_data_barcelona_y)-91]
y_train print(f'Cantidad datos de entrenamiento: x={len(x_train)} - y={len(y_train)}')
= scaled_data_barcelona_x[len(scaled_data_barcelona_x)-90:len(scaled_data_barcelona_x)]
x_test = scaled_data_barcelona_y[len(scaled_data_barcelona_y)-90:len(scaled_data_barcelona_y)]
y_test print(f'Cantidad datos de test: x={len(x_test)} - y={len(y_test)}')
Cantidad datos de entrenamiento: x=473 - y=473
Cantidad datos de test: x=90 - y=90
# Reshape the data to feed de recurrent network
= np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 8))
x_train print("Train data shape:")
print(x_train.shape)
print(y_train.shape)
= np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 8))
x_test print("Test data shape:")
print(x_test.shape)
print(y_test.shape)
Train data shape:
(473, 90, 8)
(473,)
Test data shape:
(90, 90, 8)
(90,)
# Configure / setup the neural network model - LSTM
# Build the model
print('Build model...')
= Sequential()
model # Model with Neurons
# Inputshape = neurons -> Timestamps
= x_train.shape[1]
neurons90,
model.add(LSTM(= 'relu',
activation = True,
return_sequences = (x_train.shape[1], 8)))
input_shape 50,
model.add(LSTM(= 'relu',
activation = True))
return_sequences 25,
model.add(LSTM(= 'relu',
activation = False))
return_sequences 5, activation = 'relu'))
model.add(Dense(1)) model.add(Dense(
Build model...
compile(optimizer='adam', loss='mean_squared_error')
model. model.summary()
Model: "sequential_1"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
lstm_3 (LSTM) (None, 90, 90) 35640
lstm_4 (LSTM) (None, 90, 50) 28200
lstm_5 (LSTM) (None, 25) 7600
dense_2 (Dense) (None, 5) 130
dense_3 (Dense) (None, 1) 6
=================================================================
Total params: 71,576
Trainable params: 71,576
Non-trainable params: 0
_________________________________________________________________
# Training the model
# fit network
= model.fit(x_train,
history
y_train, =1000,
batch_size=30,
epochs= (x_test, y_test),
validation_data = 0) verbose
'loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.plot(history.history[
plt.legend() plt.show()
# Get the predicted values
= model.predict(x_test)
predictions = scaler_pred.inverse_transform(predictions) predictions
1/3 [=========>....................] - ETA: 0s
3/3 [==============================] - ETA: 0s
3/3 [==============================] - 0s 31ms/step
= y_test.reshape(-1,1)
y_test = scaler_pred.inverse_transform(y_test) y_test
# Calculate the mean absolute error (MAE)
= mean_absolute_error(y_test, predictions)
mae print('MAE: ' + str(round(mae, 1)))
# Calculate the root mean squarred error (RMSE)
= np.sqrt(mean_squared_error(y_test,predictions))
rmse print('RMSE: ' + str(round(rmse, 1)))
# Calculate the root mean squarred error (RMSE)
= mean_squared_error(y_test,
rmse
predictions,= False)
squared print('RMSE: ' + str(round(rmse, 1)))
MAE: 6872.2
RMSE: 11041.0
RMSE: 11041.0
# Add the difference between the valid and predicted prices
= data_barcelona[:(len(x_train)+92)]
train = data_barcelona[(len(x_train)+91):] valid
1, "Predictions", predictions, True)
valid.insert(1, "Difference", valid["Predictions"] - valid["num_casos"], True) valid.insert(
# Zoom-in to a closer timeframe
# Date from which on the date is displayed
= "2021-10-31"
display_start_date = valid[valid.index > display_start_date]
valid = train[train.index > display_start_date] train
# Visualize the data
'ggplot')
matplotlib.style.use(= plt.subplots(figsize=(22, 10), sharex=True)
fig, ax1
# Data - Train
= train.index;
xt = train[["num_casos"]]
yt # Data - Test / validation
= valid.index;
xv = valid[["num_casos", "Predictions"]]
yv
# Plot
"barcelona: Predictions vs Real infections", fontsize=20)
plt.title("Nº Cases", fontsize=18)
plt.ylabel(
="blue", linewidth=1.5)
plt.plot(yt, color"Predictions"], color="red", linewidth=1.5)
plt.plot(yv["num_casos"], color="green", linewidth=1.5)
plt.plot(yv["Train", "LSTM Predictions", "Test"],
plt.legend([="upper left", fontsize=18)
loc
# Bar plot with the differences
= valid.index
x = valid["Difference"]
y =0.2, color="grey")
plt.bar(x, y, width
plt.grid() plt.show()
Madrid
= data_covid.loc[data_covid['provincia'] == 'Madrid']
data_madrid = data_madrid.set_index('fecha')
data_madrid = data_madrid['2020-06-14':]
data_madrid = data_madrid.filter(['num_casos', 'tmed', 'mob_grocery_pharmacy',
data_madrid 'mob_parks', 'mob_residential', 'mob_residential', 'mob_transit_stations', 'mob_workplaces'])
data_madrid
num_casos | tmed | mob_grocery_pharmacy | mob_parks | mob_residential | mob_residential | mob_transit_stations | mob_workplaces | |
---|---|---|---|---|---|---|---|---|
fecha | ||||||||
2020-06-14 | 81 | 19.0 | -15.0 | 25.0 | 7.0 | 7.0 | -43.0 | -6.0 |
2020-06-15 | 153 | 20.7 | -8.0 | 34.0 | 16.0 | 16.0 | -43.0 | -47.0 |
2020-06-16 | 91 | 21.0 | -8.0 | 21.0 | 17.0 | 17.0 | -43.0 | -47.0 |
2020-06-17 | 93 | 20.2 | -5.0 | 34.0 | 17.0 | 17.0 | -42.0 | -47.0 |
2020-06-18 | 85 | 21.9 | -7.0 | 30.0 | 16.0 | 16.0 | -44.0 | -47.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
2022-03-25 | 356 | 11.0 | -1.0 | -37.0 | 5.0 | 5.0 | -19.0 | -17.0 |
2022-03-26 | 303 | 10.4 | -7.0 | -27.0 | 1.0 | 1.0 | -15.0 | -5.0 |
2022-03-27 | 77 | 10.6 | -3.0 | -17.0 | 0.0 | 0.0 | -17.0 | -2.0 |
2022-03-28 | 839 | 10.2 | 2.0 | -3.0 | 4.0 | 4.0 | -15.0 | -17.0 |
2022-03-29 | 6 | 13.0 | 1.0 | -11.0 | 4.0 | 4.0 | -16.0 | -16.0 |
654 rows × 8 columns
data_madrid.describe()
num_casos | tmed | mob_grocery_pharmacy | mob_parks | mob_residential | mob_residential | mob_transit_stations | mob_workplaces | |
---|---|---|---|---|---|---|---|---|
count | 654.000000 | 654.000000 | 654.000000 | 654.000000 | 654.000000 | 654.000000 | 654.000000 | 654.000000 |
mean | 2392.562691 | 14.909786 | -2.608563 | -4.874618 | 6.978593 | 6.978593 | -30.003058 | -27.324159 |
std | 3390.272836 | 8.073496 | 15.108581 | 20.826186 | 5.672763 | 5.672763 | 12.389295 | 18.988563 |
min | 6.000000 | -6.200000 | -84.000000 | -62.000000 | -6.000000 | -6.000000 | -73.000000 | -86.000000 |
25% | 662.750000 | 8.600000 | -9.000000 | -17.750000 | 4.000000 | 4.000000 | -39.000000 | -40.000000 |
50% | 1413.000000 | 13.200000 | -1.000000 | -4.000000 | 7.000000 | 7.000000 | -31.000000 | -26.000000 |
75% | 2475.750000 | 21.800000 | 8.000000 | 11.000000 | 10.000000 | 10.000000 | -21.000000 | -16.000000 |
max | 23811.000000 | 32.700000 | 50.000000 | 77.000000 | 31.000000 | 31.000000 | 1.000000 | 20.000000 |
= data_madrid.values np_data_madrid
# Train dataset
= MinMaxScaler(feature_range=(0, 1))
scaler = scaler.fit_transform(np_data_madrid)
scaled_data_madrid print(f'Longitud del conjunto de datos disponible: {len(scaled_data_madrid)}')
Longitud del conjunto de datos disponible: 654
# Since we are going to predict future values based on the 90 past elements,
# we need to create a list with those historic information for each element
= 90
historic_values = []
scaled_data_madrid_x = []
scaled_data_madrid_y
for i in range(historic_values, len(scaled_data_madrid)):
-historic_values):i, :])
scaled_data_madrid_x.append(scaled_data_madrid[(i0])
scaled_data_madrid_y.append(scaled_data_madrid[i,
# Convert the x_train and y_train to numpy arrays
= np.array(scaled_data_madrid_x)
scaled_data_madrid_x = np.array(scaled_data_madrid_y) scaled_data_madrid_y
# Once predicted, we are going to need a exclusive scaler for num_cases
= MinMaxScaler(feature_range=(0, 1))
scaler_pred = pd.DataFrame(data_madrid['num_casos'])
df_cases_madrid = scaler_pred.fit_transform(df_cases_madrid) scaled_data_madrid_pred
# Since the first 90th values does not have historic, the dataset has been reduced in 90 values
print(f'Longitud datos de entrenamiento con historico: {len(scaled_data_madrid_y)}')
Longitud datos de entrenamiento con historico: 564
# we split data in train and test
# as in previous analysis, we are going to predict a maximum of 90 days
= scaled_data_madrid_x[0:len(scaled_data_madrid_x)-91]
x_train = scaled_data_madrid_y[0:len(scaled_data_madrid_y)-91]
y_train print(f'Cantidad datos de entrenamiento: x={len(x_train)} - y={len(y_train)}')
= scaled_data_madrid_x[len(scaled_data_madrid_x)-90:len(scaled_data_madrid_x)]
x_test = scaled_data_madrid_y[len(scaled_data_madrid_y)-90:len(scaled_data_madrid_y)]
y_test print(f'Cantidad datos de test: x={len(x_test)} - y={len(y_test)}')
Cantidad datos de entrenamiento: x=473 - y=473
Cantidad datos de test: x=90 - y=90
# Reshape the data to feed de recurrent network
= np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 8))
x_train print("Train data shape:")
print(x_train.shape)
print(y_train.shape)
= np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 8))
x_test print("Test data shape:")
print(x_test.shape)
print(y_test.shape)
Train data shape:
(473, 90, 8)
(473,)
Test data shape:
(90, 90, 8)
(90,)
# Configure / setup the neural network model - LSTM
# Build the model
print('Build model...')
= Sequential()
model # Model with Neurons
# Inputshape = neurons -> Timestamps
= x_train.shape[1]
neurons90,
model.add(LSTM(= 'relu',
activation = True,
return_sequences = (x_train.shape[1], 8)))
input_shape 50,
model.add(LSTM(= 'relu',
activation = True))
return_sequences 25,
model.add(LSTM(= 'relu',
activation = False))
return_sequences 5, activation = 'relu'))
model.add(Dense(1)) model.add(Dense(
Build model...
compile(optimizer='adam', loss='mean_squared_error')
model. model.summary()
Model: "sequential_2"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
lstm_6 (LSTM) (None, 90, 90) 35640
lstm_7 (LSTM) (None, 90, 50) 28200
lstm_8 (LSTM) (None, 25) 7600
dense_4 (Dense) (None, 5) 130
dense_5 (Dense) (None, 1) 6
=================================================================
Total params: 71,576
Trainable params: 71,576
Non-trainable params: 0
_________________________________________________________________
# Training the model
# fit network
= model.fit(x_train,
history
y_train, =1000,
batch_size=30,
epochs= (x_test, y_test),
validation_data = 0) verbose
'loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.plot(history.history[
plt.legend() plt.show()
# Get the predicted values
= model.predict(x_test)
predictions = scaler_pred.inverse_transform(predictions) predictions
1/3 [=========>....................] - ETA: 0s
3/3 [==============================] - ETA: 0s
3/3 [==============================] - 0s 30ms/step
= y_test.reshape(-1,1)
y_test = scaler_pred.inverse_transform(y_test) y_test
# Calculate the mean absolute error (MAE)
= mean_absolute_error(y_test, predictions)
mae print('MAE: ' + str(round(mae, 1)))
# Calculate the root mean squarred error (RMSE)
= np.sqrt(mean_squared_error(y_test,predictions))
rmse print('RMSE: ' + str(round(rmse, 1)))
# Calculate the root mean squarred error (RMSE)
= mean_squared_error(y_test,
rmse
predictions,= False)
squared print('RMSE: ' + str(round(rmse, 1)))
MAE: 4531.8
RMSE: 7047.1
RMSE: 7047.1
# Add the difference between the valid and predicted prices
= data_madrid[:(len(x_train)+92)]
train = data_madrid[(len(x_train)+91):] valid
1, "Predictions", predictions, True)
valid.insert(1, "Difference", valid["Predictions"] - valid["num_casos"], True) valid.insert(
# Zoom-in to a closer timeframe
# Date from which on the date is displayed
= "2021-10-31"
display_start_date = valid[valid.index > display_start_date]
valid = train[train.index > display_start_date] train
# Visualize the data
'ggplot')
matplotlib.style.use(= plt.subplots(figsize=(22, 10), sharex=True)
fig, ax1
# Data - Train
= train.index;
xt = train[["num_casos"]]
yt # Data - Test / validation
= valid.index;
xv = valid[["num_casos", "Predictions"]]
yv
# Plot
"madrid: Predictions vs Real infections", fontsize=20)
plt.title("Nº Cases", fontsize=18)
plt.ylabel(
="blue", linewidth=1.5)
plt.plot(yt, color"Predictions"], color="red", linewidth=1.5)
plt.plot(yv["num_casos"], color="green", linewidth=1.5)
plt.plot(yv["Train", "LSTM Predictions", "Test"],
plt.legend([="upper left", fontsize=18)
loc
# Bar plot with the differences
= valid.index
x = valid["Difference"]
y =0.2, color="grey")
plt.bar(x, y, width
plt.grid() plt.show()
Málaga
= data_covid.loc[data_covid['provincia'] == 'Málaga']
data_malaga = data_malaga.set_index('fecha')
data_malaga = data_malaga['2020-06-14':]
data_malaga = data_malaga.filter(['num_casos', 'tmed', 'mob_grocery_pharmacy',
data_malaga 'mob_parks', 'mob_residential', 'mob_residential', 'mob_transit_stations', 'mob_workplaces'])
data_malaga
num_casos | tmed | mob_grocery_pharmacy | mob_parks | mob_residential | mob_residential | mob_transit_stations | mob_workplaces | |
---|---|---|---|---|---|---|---|---|
fecha | ||||||||
2020-06-14 | 2 | 26.6 | -20.0 | -15.0 | 2.0 | 2.0 | -53.0 | -7.0 |
2020-06-15 | 1 | 27.2 | -16.0 | -9.0 | 9.0 | 9.0 | -44.0 | -33.0 |
2020-06-16 | 1 | 27.8 | -13.0 | -4.0 | 9.0 | 9.0 | -40.0 | -33.0 |
2020-06-17 | 2 | 26.9 | -13.0 | 1.0 | 9.0 | 9.0 | -41.0 | -33.0 |
2020-06-18 | 2 | 21.6 | -12.0 | 3.0 | 9.0 | 9.0 | -42.0 | -33.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
2022-03-25 | 565 | 13.8 | -3.0 | -32.0 | 8.0 | 8.0 | -18.0 | -12.0 |
2022-03-26 | 79 | 14.5 | -1.0 | -1.0 | 3.0 | 3.0 | -13.0 | -4.0 |
2022-03-27 | 65 | 15.4 | 5.0 | -18.0 | 2.0 | 2.0 | 0.0 | 0.0 |
2022-03-28 | 39 | 16.3 | 6.0 | -4.0 | 3.0 | 3.0 | -4.0 | -7.0 |
2022-03-29 | 0 | 16.8 | 4.0 | -3.0 | 4.0 | 4.0 | 1.0 | -8.0 |
654 rows × 8 columns
data_malaga.describe()
num_casos | tmed | mob_grocery_pharmacy | mob_parks | mob_residential | mob_residential | mob_transit_stations | mob_workplaces | |
---|---|---|---|---|---|---|---|---|
count | 654.000000 | 654.000000 | 654.000000 | 654.000000 | 654.000000 | 654.000000 | 654.000000 | 654.00000 |
mean | 410.206422 | 19.511162 | 8.776758 | 13.570336 | 4.529052 | 4.529052 | -18.853211 | -18.04893 |
std | 489.452348 | 5.721942 | 26.696909 | 39.148014 | 4.064365 | 4.064365 | 18.183256 | 14.55877 |
min | 0.000000 | 6.200000 | -80.000000 | -63.000000 | -4.000000 | -4.000000 | -70.000000 | -79.00000 |
25% | 122.250000 | 14.725000 | -4.000000 | -13.000000 | 2.000000 | 2.000000 | -32.000000 | -26.00000 |
50% | 215.500000 | 18.400000 | 6.000000 | 5.000000 | 4.000000 | 4.000000 | -21.000000 | -17.00000 |
75% | 475.250000 | 24.500000 | 15.000000 | 30.000000 | 6.750000 | 6.750000 | -4.000000 | -11.00000 |
max | 3080.000000 | 34.500000 | 156.000000 | 152.000000 | 22.000000 | 22.000000 | 20.000000 | 29.00000 |
= data_malaga.values np_data_malaga
# Train dataset
= MinMaxScaler(feature_range=(0, 1))
scaler = scaler.fit_transform(np_data_malaga)
scaled_data_malaga print(f'Longitud del conjunto de datos disponible: {len(scaled_data_malaga)}')
Longitud del conjunto de datos disponible: 654
# Since we are going to predict future values based on the 90 past elements,
# we need to create a list with those historic information for each element
= 90
historic_values = []
scaled_data_malaga_x = []
scaled_data_malaga_y
for i in range(historic_values, len(scaled_data_malaga)):
-historic_values):i, :])
scaled_data_malaga_x.append(scaled_data_malaga[(i0])
scaled_data_malaga_y.append(scaled_data_malaga[i,
# Convert the x_train and y_train to numpy arrays
= np.array(scaled_data_malaga_x)
scaled_data_malaga_x = np.array(scaled_data_malaga_y) scaled_data_malaga_y
# Once predicted, we are going to need a exclusive scaler for num_cases
= MinMaxScaler(feature_range=(0, 1))
scaler_pred = pd.DataFrame(data_malaga['num_casos'])
df_cases_malaga = scaler_pred.fit_transform(df_cases_malaga) scaled_data_malaga_pred
# Since the first 90th values does not have historic, the dataset has been reduced in 90 values
print(f'Longitud datos de entrenamiento con historico: {len(scaled_data_malaga_y)}')
Longitud datos de entrenamiento con historico: 564
# we split data in train and test
# as in previous analysis, we are going to predict a maximum of 90 days
= scaled_data_malaga_x[0:len(scaled_data_malaga_x)-91]
x_train = scaled_data_malaga_y[0:len(scaled_data_malaga_y)-91]
y_train print(f'Cantidad datos de entrenamiento: x={len(x_train)} - y={len(y_train)}')
= scaled_data_malaga_x[len(scaled_data_malaga_x)-90:len(scaled_data_malaga_x)]
x_test = scaled_data_malaga_y[len(scaled_data_malaga_y)-90:len(scaled_data_malaga_y)]
y_test print(f'Cantidad datos de test: x={len(x_test)} - y={len(y_test)}')
Cantidad datos de entrenamiento: x=473 - y=473
Cantidad datos de test: x=90 - y=90
# Reshape the data to feed de recurrent network
= np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 8))
x_train print("Train data shape:")
print(x_train.shape)
print(y_train.shape)
= np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 8))
x_test print("Test data shape:")
print(x_test.shape)
print(y_test.shape)
Train data shape:
(473, 90, 8)
(473,)
Test data shape:
(90, 90, 8)
(90,)
# Configure / setup the neural network model - LSTM
# Build the model
print('Build model...')
= Sequential()
model # Model with Neurons
# Inputshape = neurons -> Timestamps
= x_train.shape[1]
neurons90,
model.add(LSTM(= 'relu',
activation = True,
return_sequences = (x_train.shape[1], 8)))
input_shape 50,
model.add(LSTM(= 'relu',
activation = True))
return_sequences 25,
model.add(LSTM(= 'relu',
activation = False))
return_sequences 5, activation = 'relu'))
model.add(Dense(1)) model.add(Dense(
Build model...
compile(optimizer='adam', loss='mean_squared_error')
model. model.summary()
Model: "sequential_3"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
lstm_9 (LSTM) (None, 90, 90) 35640
lstm_10 (LSTM) (None, 90, 50) 28200
lstm_11 (LSTM) (None, 25) 7600
dense_6 (Dense) (None, 5) 130
dense_7 (Dense) (None, 1) 6
=================================================================
Total params: 71,576
Trainable params: 71,576
Non-trainable params: 0
_________________________________________________________________
# Training the model
# fit network
= model.fit(x_train,
history
y_train, =1000,
batch_size=30,
epochs= (x_test, y_test),
validation_data = 0) verbose
'loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.plot(history.history[
plt.legend() plt.show()
# Get the predicted values
= model.predict(x_test)
predictions = scaler_pred.inverse_transform(predictions) predictions
1/3 [=========>....................] - ETA: 0s
3/3 [==============================] - ETA: 0s
3/3 [==============================] - 0s 31ms/step
= y_test.reshape(-1,1)
y_test = scaler_pred.inverse_transform(y_test) y_test
# Calculate the mean absolute error (MAE)
= mean_absolute_error(y_test, predictions)
mae print('MAE: ' + str(round(mae, 1)))
# Calculate the root mean squarred error (RMSE)
= np.sqrt(mean_squared_error(y_test,predictions))
rmse print('RMSE: ' + str(round(rmse, 1)))
# Calculate the root mean squarred error (RMSE)
= mean_squared_error(y_test,
rmse
predictions,= False)
squared print('RMSE: ' + str(round(rmse, 1)))
MAE: 322.4
RMSE: 508.6
RMSE: 508.6
# Add the difference between the valid and predicted prices
= data_malaga[:(len(x_train)+92)]
train = data_malaga[(len(x_train)+91):] valid
1, "Predictions", predictions, True)
valid.insert(1, "Difference", valid["Predictions"] - valid["num_casos"], True) valid.insert(
# Zoom-in to a closer timeframe
# Date from which on the date is displayed
= "2021-10-31"
display_start_date = valid[valid.index > display_start_date]
valid = train[train.index > display_start_date] train
# Visualize the data
'ggplot')
matplotlib.style.use(= plt.subplots(figsize=(22, 10), sharex=True)
fig, ax1
# Data - Train
= train.index;
xt = train[["num_casos"]]
yt # Data - Test / validation
= valid.index;
xv = valid[["num_casos", "Predictions"]]
yv
# Plot
"malaga: Predictions vs Real infections", fontsize=20)
plt.title("Nº Cases", fontsize=18)
plt.ylabel(
="blue", linewidth=1.5)
plt.plot(yt, color"Predictions"], color="red", linewidth=1.5)
plt.plot(yv["num_casos"], color="green", linewidth=1.5)
plt.plot(yv["Train", "LSTM Predictions", "Test"],
plt.legend([="upper left", fontsize=18)
loc
# Bar plot with the differences
= valid.index
x = valid["Difference"]
y =0.2, color="grey")
plt.bar(x, y, width
plt.grid() plt.show()
Sevilla
= data_covid.loc[data_covid['provincia'] == 'Sevilla']
data_sevilla = data_sevilla.set_index('fecha')
data_sevilla = data_sevilla['2020-06-14':]
data_sevilla = data_sevilla.filter(['num_casos', 'tmed', 'mob_grocery_pharmacy',
data_sevilla 'mob_parks', 'mob_residential', 'mob_residential', 'mob_transit_stations', 'mob_workplaces'])
data_sevilla
num_casos | tmed | mob_grocery_pharmacy | mob_parks | mob_residential | mob_residential | mob_transit_stations | mob_workplaces | |
---|---|---|---|---|---|---|---|---|
fecha | ||||||||
2020-06-14 | 0 | 22.200000 | -24.0 | -36.0 | 0.0 | 0.0 | -51.0 | 0.0 |
2020-06-15 | 2 | 23.088889 | -14.0 | -13.0 | 9.0 | 9.0 | -41.0 | -33.0 |
2020-06-16 | 1 | 23.977778 | -10.0 | -4.0 | 9.0 | 9.0 | -37.0 | -32.0 |
2020-06-17 | 0 | 24.866667 | -10.0 | -7.0 | 9.0 | 9.0 | -37.0 | -32.0 |
2020-06-18 | 0 | 25.755556 | -12.0 | -13.0 | 9.0 | 9.0 | -41.0 | -33.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
2022-03-25 | 365 | 14.100000 | 2.0 | -20.0 | 4.0 | 4.0 | -14.0 | -7.0 |
2022-03-26 | 67 | 15.200000 | -9.0 | 2.0 | 0.0 | 0.0 | -12.0 | -5.0 |
2022-03-27 | 24 | 15.900000 | 2.0 | -13.0 | -1.0 | -1.0 | -10.0 | -7.0 |
2022-03-28 | 12 | 17.100000 | 2.0 | -5.0 | 2.0 | 2.0 | -10.0 | -6.0 |
2022-03-29 | 0 | 16.000000 | 3.0 | -7.0 | 2.0 | 2.0 | -13.0 | -5.0 |
654 rows × 8 columns
data_sevilla.describe()
num_casos | tmed | mob_grocery_pharmacy | mob_parks | mob_residential | mob_residential | mob_transit_stations | mob_workplaces | |
---|---|---|---|---|---|---|---|---|
count | 654.000000 | 654.000000 | 654.000000 | 654.000000 | 654.000000 | 654.000000 | 654.000000 | 654.000000 |
mean | 432.978593 | 19.663609 | -0.709480 | -14.958716 | 3.915902 | 3.915902 | -25.707951 | -20.347095 |
std | 500.618517 | 6.949955 | 18.851349 | 17.982375 | 4.397963 | 4.397963 | 14.307026 | 14.434846 |
min | 0.000000 | 5.500000 | -79.000000 | -70.000000 | -7.000000 | -7.000000 | -71.000000 | -79.000000 |
25% | 127.250000 | 13.800000 | -8.000000 | -27.000000 | 1.000000 | 1.000000 | -36.000000 | -29.000000 |
50% | 282.000000 | 18.600000 | 0.000000 | -13.000000 | 4.000000 | 4.000000 | -26.000000 | -17.000000 |
75% | 528.250000 | 25.600000 | 7.000000 | -4.000000 | 6.000000 | 6.000000 | -15.000000 | -10.000000 |
max | 3692.000000 | 34.400000 | 117.000000 | 61.000000 | 22.000000 | 22.000000 | 16.000000 | 9.000000 |
= data_sevilla.values np_data_sevilla
# Train dataset
= MinMaxScaler(feature_range=(0, 1))
scaler = scaler.fit_transform(np_data_sevilla)
scaled_data_sevilla print(f'Longitud del conjunto de datos disponible: {len(scaled_data_sevilla)}')
Longitud del conjunto de datos disponible: 654
# Since we are going to predict future values based on the 90 past elements,
# we need to create a list with those historic information for each element
= 90
historic_values = []
scaled_data_sevilla_x = []
scaled_data_sevilla_y
for i in range(historic_values, len(scaled_data_sevilla)):
-historic_values):i, :])
scaled_data_sevilla_x.append(scaled_data_sevilla[(i0])
scaled_data_sevilla_y.append(scaled_data_sevilla[i,
# Convert the x_train and y_train to numpy arrays
= np.array(scaled_data_sevilla_x)
scaled_data_sevilla_x = np.array(scaled_data_sevilla_y) scaled_data_sevilla_y
# Once predicted, we are going to need a exclusive scaler for num_cases
= MinMaxScaler(feature_range=(0, 1))
scaler_pred = pd.DataFrame(data_sevilla['num_casos'])
df_cases_sevilla = scaler_pred.fit_transform(df_cases_sevilla) scaled_data_sevilla_pred
# Since the first 90th values does not have historic, the dataset has been reduced in 90 values
print(f'Longitud datos de entrenamiento con historico: {len(scaled_data_sevilla_y)}')
Longitud datos de entrenamiento con historico: 564
# we split data in train and test
# as in previous analysis, we are going to predict a maximum of 90 days
= scaled_data_sevilla_x[0:len(scaled_data_sevilla_x)-91]
x_train = scaled_data_sevilla_y[0:len(scaled_data_sevilla_y)-91]
y_train print(f'Cantidad datos de entrenamiento: x={len(x_train)} - y={len(y_train)}')
= scaled_data_sevilla_x[len(scaled_data_sevilla_x)-90:len(scaled_data_sevilla_x)]
x_test = scaled_data_sevilla_y[len(scaled_data_sevilla_y)-90:len(scaled_data_sevilla_y)]
y_test print(f'Cantidad datos de test: x={len(x_test)} - y={len(y_test)}')
Cantidad datos de entrenamiento: x=473 - y=473
Cantidad datos de test: x=90 - y=90
# Reshape the data to feed de recurrent network
= np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 8))
x_train print("Train data shape:")
print(x_train.shape)
print(y_train.shape)
= np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 8))
x_test print("Test data shape:")
print(x_test.shape)
print(y_test.shape)
Train data shape:
(473, 90, 8)
(473,)
Test data shape:
(90, 90, 8)
(90,)
# Configure / setup the neural network model - LSTM
# Build the model
print('Build model...')
= Sequential()
model # Model with Neurons
# Inputshape = neurons -> Timestamps
= x_train.shape[1]
neurons90,
model.add(LSTM(= 'relu',
activation = True,
return_sequences = (x_train.shape[1], 8)))
input_shape 50,
model.add(LSTM(= 'relu',
activation = True))
return_sequences 25,
model.add(LSTM(= 'relu',
activation = False))
return_sequences 5, activation = 'relu'))
model.add(Dense(1)) model.add(Dense(
Build model...
compile(optimizer='adam', loss='mean_squared_error')
model. model.summary()
Model: "sequential_4"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
lstm_12 (LSTM) (None, 90, 90) 35640
lstm_13 (LSTM) (None, 90, 50) 28200
lstm_14 (LSTM) (None, 25) 7600
dense_8 (Dense) (None, 5) 130
dense_9 (Dense) (None, 1) 6
=================================================================
Total params: 71,576
Trainable params: 71,576
Non-trainable params: 0
_________________________________________________________________
# Training the model
# fit network
= model.fit(x_train,
history
y_train, =1000,
batch_size=30,
epochs= (x_test, y_test),
validation_data = 0) verbose
'loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.plot(history.history[
plt.legend() plt.show()
# Get the predicted values
= model.predict(x_test)
predictions = scaler_pred.inverse_transform(predictions) predictions
WARNING:tensorflow:5 out of the last 13 calls to <function Model.make_predict_function.<locals>.predict_function at 0x7fd54fad4430> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings could be due to (1) creating @tf.function repeatedly in a loop, (2) passing tensors with different shapes, (3) passing Python objects instead of tensors. For (1), please define your @tf.function outside of the loop. For (2), @tf.function has reduce_retracing=True option that can avoid unnecessary retracing. For (3), please refer to https://www.tensorflow.org/guide/function#controlling_retracing and https://www.tensorflow.org/api_docs/python/tf/function for more details.
1/3 [=========>....................] - ETA: 0s
3/3 [==============================] - ETA: 0s
3/3 [==============================] - 0s 31ms/step
= y_test.reshape(-1,1)
y_test = scaler_pred.inverse_transform(y_test) y_test
# Calculate the mean absolute error (MAE)
= mean_absolute_error(y_test, predictions)
mae print('MAE: ' + str(round(mae, 1)))
# Calculate the root mean squarred error (RMSE)
= np.sqrt(mean_squared_error(y_test,predictions))
rmse print('RMSE: ' + str(round(rmse, 1)))
# Calculate the root mean squarred error (RMSE)
= mean_squared_error(y_test,
rmse
predictions,= False)
squared print('RMSE: ' + str(round(rmse, 1)))
MAE: 1622.4
RMSE: 1889.7
RMSE: 1889.7
# Add the difference between the valid and predicted prices
= data_sevilla[:(len(x_train)+92)]
train = data_sevilla[(len(x_train)+91):] valid
1, "Predictions", predictions, True)
valid.insert(1, "Difference", valid["Predictions"] - valid["num_casos"], True) valid.insert(
# Zoom-in to a closer timeframe
# Date from which on the date is displayed
= "2021-10-31"
display_start_date = valid[valid.index > display_start_date]
valid = train[train.index > display_start_date] train
# Visualize the data
'ggplot')
matplotlib.style.use(= plt.subplots(figsize=(22, 10), sharex=True)
fig, ax1
# Data - Train
= train.index;
xt = train[["num_casos"]]
yt # Data - Test / validation
= valid.index;
xv = valid[["num_casos", "Predictions"]]
yv
# Plot
"sevilla: Predictions vs Real infections", fontsize=20)
plt.title("Nº Cases", fontsize=18)
plt.ylabel(
="blue", linewidth=1.5)
plt.plot(yt, color"Predictions"], color="red", linewidth=1.5)
plt.plot(yv["num_casos"], color="green", linewidth=1.5)
plt.plot(yv["Train", "LSTM Predictions", "Test"],
plt.legend([="upper left", fontsize=18)
loc
# Bar plot with the differences
= valid.index
x = valid["Difference"]
y =0.2, color="grey")
plt.bar(x, y, width
plt.grid() plt.show()
Just before the sixth wave
Asturias
= data_covid.loc[data_covid['provincia'] == 'Asturias']
data_asturias = data_asturias.set_index('fecha')
data_asturias = data_asturias['2020-06-14':'2021-12-31']
data_asturias = data_asturias.filter(['num_casos', 'tmed', 'mob_grocery_pharmacy',
data_asturias 'mob_parks', 'mob_residential', 'mob_residential', 'mob_transit_stations', 'mob_workplaces'])
data_asturias
num_casos | tmed | mob_grocery_pharmacy | mob_parks | mob_residential | mob_residential | mob_transit_stations | mob_workplaces | |
---|---|---|---|---|---|---|---|---|
fecha | ||||||||
2020-06-14 | 0 | 16.8 | -8.0 | 6.0 | 4.0 | 4.0 | -43.0 | -2.0 |
2020-06-15 | 0 | 16.0 | -6.0 | 39.0 | 7.0 | 7.0 | -38.0 | -27.0 |
2020-06-16 | 1 | 15.6 | -6.0 | 7.0 | 9.0 | 9.0 | -43.0 | -29.0 |
2020-06-17 | 0 | 15.6 | -7.0 | 20.0 | 8.0 | 8.0 | -42.0 | -28.0 |
2020-06-18 | 0 | 15.1 | -4.0 | 68.0 | 7.0 | 7.0 | -39.0 | -28.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
2021-12-27 | 2363 | 16.6 | 21.0 | 57.0 | 5.0 | 5.0 | -9.0 | -35.0 |
2021-12-28 | 2150 | 19.2 | 18.0 | 90.0 | 4.0 | 4.0 | -6.0 | -36.0 |
2021-12-29 | 2159 | 15.6 | 24.0 | 108.0 | 4.0 | 4.0 | 0.0 | -34.0 |
2021-12-30 | 2020 | 13.2 | 39.0 | 88.0 | 4.0 | 4.0 | -10.0 | -36.0 |
2021-12-31 | 1949 | 14.6 | 21.0 | 46.0 | 10.0 | 10.0 | -21.0 | -54.0 |
566 rows × 8 columns
data_asturias.describe()
num_casos | tmed | mob_grocery_pharmacy | mob_parks | mob_residential | mob_residential | mob_transit_stations | mob_workplaces | |
---|---|---|---|---|---|---|---|---|
count | 566.000000 | 566.000000 | 566.000000 | 566.000000 | 566.000000 | 566.000000 | 566.000000 | 566.000000 |
mean | 180.913428 | 14.495671 | 4.159011 | 62.592756 | 3.798587 | 3.798587 | -12.772085 | -21.289753 |
std | 276.250633 | 4.098429 | 15.207805 | 71.365461 | 5.051424 | 5.051424 | 15.668332 | 13.018468 |
min | 0.000000 | 3.300000 | -87.000000 | -63.000000 | -8.000000 | -8.000000 | -67.000000 | -80.000000 |
25% | 36.000000 | 11.525000 | -0.375000 | 14.000000 | 0.000000 | 0.000000 | -23.000000 | -27.000000 |
50% | 94.500000 | 15.000000 | 5.000000 | 43.000000 | 3.000000 | 3.000000 | -13.000000 | -20.000000 |
75% | 225.750000 | 18.000000 | 11.000000 | 91.000000 | 7.000000 | 7.000000 | -2.000000 | -15.000000 |
max | 2363.000000 | 25.100000 | 42.000000 | 333.000000 | 26.000000 | 26.000000 | 26.000000 | 12.000000 |
= data_asturias.values np_data_asturias
# Train dataset
= MinMaxScaler(feature_range=(0, 1))
scaler = scaler.fit_transform(np_data_asturias)
scaled_data_asturias print(f'Longitud del conjunto de datos disponible: {len(scaled_data_asturias)}')
Longitud del conjunto de datos disponible: 566
# Since we are going to predict future values based on the 90 past elements,
# we need to create a list with those historic information for each element
= 90
historic_values = []
scaled_data_asturias_x = []
scaled_data_asturias_y
for i in range(historic_values, len(scaled_data_asturias)):
-historic_values):i, :])
scaled_data_asturias_x.append(scaled_data_asturias[(i0])
scaled_data_asturias_y.append(scaled_data_asturias[i,
# Convert the x_train and y_train to numpy arrays
= np.array(scaled_data_asturias_x)
scaled_data_asturias_x = np.array(scaled_data_asturias_y) scaled_data_asturias_y
# Once predicted, we are going to need a exclusive scaler for num_cases
= MinMaxScaler(feature_range=(0, 1))
scaler_pred = pd.DataFrame(data_asturias['num_casos'])
df_cases_asturias = scaler_pred.fit_transform(df_cases_asturias) scaled_data_asturias_pred
# Since the first 90th values does not have historic, the dataset has been reduced in 90 values
print(f'Longitud datos de entrenamiento con historico: {len(scaled_data_asturias_y)}')
Longitud datos de entrenamiento con historico: 476
# we split data in train and test
# as in previous analysis, we are going to predict a maximum of 90 days
= scaled_data_asturias_x[0:len(scaled_data_asturias_x)-91]
x_train = scaled_data_asturias_y[0:len(scaled_data_asturias_y)-91]
y_train print(f'Cantidad datos de entrenamiento: x={len(x_train)} - y={len(y_train)}')
= scaled_data_asturias_x[len(scaled_data_asturias_x)-90:len(scaled_data_asturias_x)]
x_test = scaled_data_asturias_y[len(scaled_data_asturias_y)-90:len(scaled_data_asturias_y)]
y_test print(f'Cantidad datos de test: x={len(x_test)} - y={len(y_test)}')
Cantidad datos de entrenamiento: x=385 - y=385
Cantidad datos de test: x=90 - y=90
# Reshape the data to feed de recurrent network
= np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 8))
x_train print("Train data shape:")
print(x_train.shape)
print(y_train.shape)
= np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 8))
x_test print("Test data shape:")
print(x_test.shape)
print(y_test.shape)
Train data shape:
(385, 90, 8)
(385,)
Test data shape:
(90, 90, 8)
(90,)
# Configure / setup the neural network model - LSTM
# Build the model
print('Build model...')
= Sequential()
model # Model with Neurons
# Inputshape = neurons -> Timestamps
= x_train.shape[1]
neurons90,
model.add(LSTM(= 'relu',
activation = True,
return_sequences = (x_train.shape[1], 8)))
input_shape 50,
model.add(LSTM(= 'relu',
activation = True))
return_sequences 25,
model.add(LSTM(= 'relu',
activation = False))
return_sequences 5, activation = 'relu'))
model.add(Dense(1)) model.add(Dense(
Build model...
compile(optimizer='adam', loss='mean_squared_error')
model. model.summary()
Model: "sequential_5"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
lstm_15 (LSTM) (None, 90, 90) 35640
lstm_16 (LSTM) (None, 90, 50) 28200
lstm_17 (LSTM) (None, 25) 7600
dense_10 (Dense) (None, 5) 130
dense_11 (Dense) (None, 1) 6
=================================================================
Total params: 71,576
Trainable params: 71,576
Non-trainable params: 0
_________________________________________________________________
# Training the model
# fit network
= model.fit(x_train,
history
y_train, =1000,
batch_size=50,
epochs= (x_test, y_test),
validation_data = 0) verbose
'loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.plot(history.history[
plt.legend() plt.show()
# Get the predicted values
= model.predict(x_test)
predictions = scaler_pred.inverse_transform(predictions) predictions
WARNING:tensorflow:5 out of the last 13 calls to <function Model.make_predict_function.<locals>.predict_function at 0x7fd53efb6310> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings could be due to (1) creating @tf.function repeatedly in a loop, (2) passing tensors with different shapes, (3) passing Python objects instead of tensors. For (1), please define your @tf.function outside of the loop. For (2), @tf.function has reduce_retracing=True option that can avoid unnecessary retracing. For (3), please refer to https://www.tensorflow.org/guide/function#controlling_retracing and https://www.tensorflow.org/api_docs/python/tf/function for more details.
1/3 [=========>....................] - ETA: 0s
3/3 [==============================] - ETA: 0s
3/3 [==============================] - 0s 31ms/step
= y_test.reshape(-1,1)
y_test = scaler_pred.inverse_transform(y_test) y_test
# Calculate the mean absolute error (MAE)
= mean_absolute_error(y_test, predictions)
mae print('MAE: ' + str(round(mae, 1)))
# Calculate the root mean squarred error (RMSE)
= np.sqrt(mean_squared_error(y_test,predictions))
rmse print('RMSE: ' + str(round(rmse, 1)))
# Calculate the root mean squarred error (RMSE)
= mean_squared_error(y_test,
rmse
predictions,= False)
squared print('RMSE: ' + str(round(rmse, 1)))
MAE: 263.9
RMSE: 465.3
RMSE: 465.3
# Add the difference between the valid and predicted prices
= data_asturias[:(len(x_train)+92)]
train = data_asturias[(len(x_train)+91):] valid
1, "Predictions", predictions, True)
valid.insert(1, "Difference", valid["Predictions"] - valid["num_casos"], True) valid.insert(
# Zoom-in to a closer timeframe
# Date from which on the date is displayed
= "2021-07-15"
display_start_date = valid[valid.index > display_start_date]
valid = train[train.index > display_start_date] train
# Visualize the data
'ggplot')
matplotlib.style.use(= plt.subplots(figsize=(22, 10), sharex=True)
fig, ax1
# Data - Train
= train.index;
xt = train[["num_casos"]]
yt # Data - Test / validation
= valid.index;
xv = valid[["num_casos", "Predictions"]]
yv
# Plot
"Asturias: Predictions vs Real infections", fontsize=20)
plt.title("Nº Cases", fontsize=18)
plt.ylabel(
="blue", linewidth=1.5)
plt.plot(yt, color"Predictions"], color="red", linewidth=1.5)
plt.plot(yv["num_casos"], color="green", linewidth=1.5)
plt.plot(yv["Train", "LSTM Predictions", "Test"],
plt.legend([="upper left", fontsize=18)
loc
# Bar plot with the differences
= valid.index
x = valid["Difference"]
y =0.2, color="grey")
plt.bar(x, y, width
plt.grid() plt.show()
Barcelona
= data_covid.loc[data_covid['provincia'] == 'Barcelona']
data_barcelona = data_barcelona.set_index('fecha')
data_barcelona = data_barcelona['2020-06-14':'2021-12-31']
data_barcelona = data_barcelona.filter(['num_casos', 'tmed', 'mob_grocery_pharmacy',
data_barcelona 'mob_parks', 'mob_residential', 'mob_residential', 'mob_transit_stations', 'mob_workplaces'])
data_barcelona
num_casos | tmed | mob_grocery_pharmacy | mob_parks | mob_residential | mob_residential | mob_transit_stations | mob_workplaces | |
---|---|---|---|---|---|---|---|---|
fecha | ||||||||
2020-06-14 | 33 | 20.6 | -20.0 | 15.0 | 6.0 | 6.0 | -31.0 | -6.0 |
2020-06-15 | 62 | 21.2 | -9.0 | 17.0 | 14.0 | 14.0 | -34.0 | -41.0 |
2020-06-16 | 66 | 20.8 | -10.0 | -16.0 | 16.0 | 16.0 | -39.0 | -41.0 |
2020-06-17 | 70 | 20.3 | -4.0 | 14.0 | 15.0 | 15.0 | -33.0 | -40.0 |
2020-06-18 | 68 | 18.8 | -8.0 | -6.0 | 16.0 | 16.0 | -38.0 | -41.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
2021-12-27 | 19383 | 16.4 | 15.0 | 7.0 | 12.0 | 12.0 | -25.0 | -52.0 |
2021-12-28 | 20192 | 17.7 | 17.0 | 7.0 | 13.0 | 13.0 | -25.0 | -52.0 |
2021-12-29 | 19361 | 17.0 | 23.0 | 10.0 | 13.0 | 13.0 | -25.0 | -51.0 |
2021-12-30 | 17639 | 15.8 | 37.0 | 6.0 | 12.0 | 12.0 | -24.0 | -53.0 |
2021-12-31 | 16651 | 13.6 | 26.0 | -12.0 | 16.0 | 16.0 | -36.0 | -60.0 |
566 rows × 8 columns
data_barcelona.describe()
num_casos | tmed | mob_grocery_pharmacy | mob_parks | mob_residential | mob_residential | mob_transit_stations | mob_workplaces | |
---|---|---|---|---|---|---|---|---|
count | 566.000000 | 566.000000 | 566.000000 | 566.000000 | 566.000000 | 566.000000 | 566.000000 | 566.000000 |
mean | 1575.480565 | 18.205477 | -0.574205 | -4.651943 | 7.353357 | 7.353357 | -24.865724 | -28.471731 |
std | 2281.290383 | 6.086976 | 16.775476 | 15.728472 | 5.162377 | 5.162377 | 11.574598 | 16.238872 |
min | 33.000000 | 4.300000 | -83.000000 | -76.000000 | -6.000000 | -6.000000 | -72.000000 | -87.000000 |
25% | 544.750000 | 12.800000 | -7.000000 | -14.750000 | 4.000000 | 4.000000 | -33.000000 | -36.000000 |
50% | 944.500000 | 17.800000 | 0.500000 | -4.000000 | 8.000000 | 8.000000 | -25.000000 | -26.000000 |
75% | 1609.500000 | 24.375000 | 10.000000 | 7.000000 | 10.000000 | 10.000000 | -15.000000 | -19.250000 |
max | 20192.000000 | 28.400000 | 58.000000 | 33.000000 | 32.000000 | 32.000000 | -2.000000 | 6.000000 |
= data_barcelona.values np_data_barcelona
# Train dataset
= MinMaxScaler(feature_range=(0, 1))
scaler = scaler.fit_transform(np_data_barcelona)
scaled_data_barcelona print(f'Longitud del conjunto de datos disponible: {len(scaled_data_barcelona)}')
Longitud del conjunto de datos disponible: 566
# Since we are going to predict future values based on the 90 past elements,
# we need to create a list with those historic information for each element
= 90
historic_values = []
scaled_data_barcelona_x = []
scaled_data_barcelona_y
for i in range(historic_values, len(scaled_data_barcelona)):
-historic_values):i, :])
scaled_data_barcelona_x.append(scaled_data_barcelona[(i0])
scaled_data_barcelona_y.append(scaled_data_barcelona[i,
# Convert the x_train and y_train to numpy arrays
= np.array(scaled_data_barcelona_x)
scaled_data_barcelona_x = np.array(scaled_data_barcelona_y) scaled_data_barcelona_y
# Once predicted, we are going to need a exclusive scaler for num_cases
= MinMaxScaler(feature_range=(0, 1))
scaler_pred = pd.DataFrame(data_barcelona['num_casos'])
df_cases_barcelona = scaler_pred.fit_transform(df_cases_barcelona) scaled_data_barcelona_pred
# Since the first 90th values does not have historic, the dataset has been reduced in 90 values
print(f'Longitud datos de entrenamiento con historico: {len(scaled_data_barcelona_y)}')
Longitud datos de entrenamiento con historico: 476
# we split data in train and test
# as in previous analysis, we are going to predict a maximum of 90 days
= scaled_data_barcelona_x[0:len(scaled_data_barcelona_x)-91]
x_train = scaled_data_barcelona_y[0:len(scaled_data_barcelona_y)-91]
y_train print(f'Cantidad datos de entrenamiento: x={len(x_train)} - y={len(y_train)}')
= scaled_data_barcelona_x[len(scaled_data_barcelona_x)-90:len(scaled_data_barcelona_x)]
x_test = scaled_data_barcelona_y[len(scaled_data_barcelona_y)-90:len(scaled_data_barcelona_y)]
y_test print(f'Cantidad datos de test: x={len(x_test)} - y={len(y_test)}')
Cantidad datos de entrenamiento: x=385 - y=385
Cantidad datos de test: x=90 - y=90
# Reshape the data to feed de recurrent network
= np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 8))
x_train print("Train data shape:")
print(x_train.shape)
print(y_train.shape)
= np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 8))
x_test print("Test data shape:")
print(x_test.shape)
print(y_test.shape)
Train data shape:
(385, 90, 8)
(385,)
Test data shape:
(90, 90, 8)
(90,)
# Configure / setup the neural network model - LSTM
# Build the model
print('Build model...')
= Sequential()
model # Model with Neurons
# Inputshape = neurons -> Timestamps
= x_train.shape[1]
neurons90,
model.add(LSTM(= 'relu',
activation = True,
return_sequences = (x_train.shape[1], 8)))
input_shape 50,
model.add(LSTM(= 'relu',
activation = True))
return_sequences 25,
model.add(LSTM(= 'relu',
activation = False))
return_sequences 5, activation = 'relu'))
model.add(Dense(1)) model.add(Dense(
Build model...
compile(optimizer='adam', loss='mean_squared_error')
model. model.summary()
Model: "sequential_6"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
lstm_18 (LSTM) (None, 90, 90) 35640
lstm_19 (LSTM) (None, 90, 50) 28200
lstm_20 (LSTM) (None, 25) 7600
dense_12 (Dense) (None, 5) 130
dense_13 (Dense) (None, 1) 6
=================================================================
Total params: 71,576
Trainable params: 71,576
Non-trainable params: 0
_________________________________________________________________
# Training the model
# fit network
= model.fit(x_train,
history
y_train, =1000,
batch_size=50,
epochs= (x_test, y_test),
validation_data = 0) verbose
'loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.plot(history.history[
plt.legend() plt.show()
# Get the predicted values
= model.predict(x_test)
predictions = scaler_pred.inverse_transform(predictions) predictions
1/3 [=========>....................] - ETA: 0s
3/3 [==============================] - ETA: 0s
3/3 [==============================] - 0s 31ms/step
= y_test.reshape(-1,1)
y_test = scaler_pred.inverse_transform(y_test) y_test
# Calculate the mean absolute error (MAE)
= mean_absolute_error(y_test, predictions)
mae print('MAE: ' + str(round(mae, 1)))
# Calculate the root mean squarred error (RMSE)
= np.sqrt(mean_squared_error(y_test,predictions))
rmse print('RMSE: ' + str(round(rmse, 1)))
# Calculate the root mean squarred error (RMSE)
= mean_squared_error(y_test,
rmse
predictions,= False)
squared print('RMSE: ' + str(round(rmse, 1)))
MAE: 2657.6
RMSE: 4156.6
RMSE: 4156.6
# Add the difference between the valid and predicted prices
= data_barcelona[:(len(x_train)+92)]
train = data_barcelona[(len(x_train)+91):] valid
1, "Predictions", predictions, True)
valid.insert(1, "Difference", valid["Predictions"] - valid["num_casos"], True) valid.insert(
# Zoom-in to a closer timeframe
# Date from which on the date is displayed
= "2021-07-15"
display_start_date = valid[valid.index > display_start_date]
valid = train[train.index > display_start_date] train
# Visualize the data
'ggplot')
matplotlib.style.use(= plt.subplots(figsize=(22, 10), sharex=True)
fig, ax1
# Data - Train
= train.index;
xt = train[["num_casos"]]
yt # Data - Test / validation
= valid.index;
xv = valid[["num_casos", "Predictions"]]
yv
# Plot
"barcelona: Predictions vs Real infections", fontsize=20)
plt.title("Nº Cases", fontsize=18)
plt.ylabel(
="blue", linewidth=1.5)
plt.plot(yt, color"Predictions"], color="red", linewidth=1.5)
plt.plot(yv["num_casos"], color="green", linewidth=1.5)
plt.plot(yv["Train", "LSTM Predictions", "Test"],
plt.legend([="upper left", fontsize=18)
loc
# Bar plot with the differences
= valid.index
x = valid["Difference"]
y =0.2, color="grey")
plt.bar(x, y, width
plt.grid() plt.show()
Madrid
= data_covid.loc[data_covid['provincia'] == 'Madrid']
data_madrid = data_madrid.set_index('fecha')
data_madrid = data_madrid['2020-06-14':'2021-12-31']
data_madrid = data_madrid.filter(['num_casos', 'tmed', 'mob_grocery_pharmacy',
data_madrid 'mob_parks', 'mob_residential', 'mob_residential', 'mob_transit_stations', 'mob_workplaces'])
data_madrid
num_casos | tmed | mob_grocery_pharmacy | mob_parks | mob_residential | mob_residential | mob_transit_stations | mob_workplaces | |
---|---|---|---|---|---|---|---|---|
fecha | ||||||||
2020-06-14 | 81 | 19.0 | -15.0 | 25.0 | 7.0 | 7.0 | -43.0 | -6.0 |
2020-06-15 | 153 | 20.7 | -8.0 | 34.0 | 16.0 | 16.0 | -43.0 | -47.0 |
2020-06-16 | 91 | 21.0 | -8.0 | 21.0 | 17.0 | 17.0 | -43.0 | -47.0 |
2020-06-17 | 93 | 20.2 | -5.0 | 34.0 | 17.0 | 17.0 | -42.0 | -47.0 |
2020-06-18 | 85 | 21.9 | -7.0 | 30.0 | 16.0 | 16.0 | -44.0 | -47.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
2021-12-27 | 22958 | 12.3 | 21.0 | 19.0 | 12.0 | 12.0 | -28.0 | -52.0 |
2021-12-28 | 23811 | 10.4 | 18.0 | 23.0 | 12.0 | 12.0 | -30.0 | -52.0 |
2021-12-29 | 21914 | 9.0 | 27.0 | 18.0 | 13.0 | 13.0 | -30.0 | -52.0 |
2021-12-30 | 20666 | 8.7 | 42.0 | 17.0 | 12.0 | 12.0 | -30.0 | -53.0 |
2021-12-31 | 7556 | 9.2 | 10.0 | -15.0 | 18.0 | 18.0 | -47.0 | -69.0 |
566 rows × 8 columns
data_madrid.describe()
num_casos | tmed | mob_grocery_pharmacy | mob_parks | mob_residential | mob_residential | mob_transit_stations | mob_workplaces | |
---|---|---|---|---|---|---|---|---|
count | 566.000000 | 566.000000 | 566.000000 | 566.000000 | 566.000000 | 566.000000 | 566.000000 | 566.000000 |
mean | 1994.136042 | 16.004064 | -3.793286 | -4.438163 | 7.238516 | 7.238516 | -31.560071 | -28.715548 |
std | 2795.419848 | 8.085617 | 14.945139 | 21.384112 | 5.850467 | 5.850467 | 12.292703 | 19.230654 |
min | 28.000000 | -6.200000 | -84.000000 | -62.000000 | -6.000000 | -6.000000 | -73.000000 | -86.000000 |
25% | 550.250000 | 9.525000 | -11.000000 | -17.750000 | 4.000000 | 4.000000 | -40.000000 | -41.000000 |
50% | 1312.500000 | 14.700000 | -3.000000 | -3.000000 | 7.000000 | 7.000000 | -33.000000 | -28.000000 |
75% | 2282.500000 | 23.200000 | 6.000000 | 11.750000 | 11.000000 | 11.000000 | -24.000000 | -17.000000 |
max | 23811.000000 | 32.700000 | 50.000000 | 77.000000 | 31.000000 | 31.000000 | 1.000000 | 20.000000 |
= data_madrid.values np_data_madrid
# Train dataset
= MinMaxScaler(feature_range=(0, 1))
scaler = scaler.fit_transform(np_data_madrid)
scaled_data_madrid print(f'Longitud del conjunto de datos disponible: {len(scaled_data_madrid)}')
Longitud del conjunto de datos disponible: 566
# Since we are going to predict future values based on the 90 past elements,
# we need to create a list with those historic information for each element
= 90
historic_values = []
scaled_data_madrid_x = []
scaled_data_madrid_y
for i in range(historic_values, len(scaled_data_madrid)):
-historic_values):i, :])
scaled_data_madrid_x.append(scaled_data_madrid[(i0])
scaled_data_madrid_y.append(scaled_data_madrid[i,
# Convert the x_train and y_train to numpy arrays
= np.array(scaled_data_madrid_x)
scaled_data_madrid_x = np.array(scaled_data_madrid_y) scaled_data_madrid_y
# Once predicted, we are going to need a exclusive scaler for num_cases
= MinMaxScaler(feature_range=(0, 1))
scaler_pred = pd.DataFrame(data_madrid['num_casos'])
df_cases_madrid = scaler_pred.fit_transform(df_cases_madrid) scaled_data_madrid_pred
# Since the first 90th values does not have historic, the dataset has been reduced in 90 values
print(f'Longitud datos de entrenamiento con historico: {len(scaled_data_madrid_y)}')
Longitud datos de entrenamiento con historico: 476
# we split data in train and test
# as in previous analysis, we are going to predict a maximum of 90 days
= scaled_data_madrid_x[0:len(scaled_data_madrid_x)-91]
x_train = scaled_data_madrid_y[0:len(scaled_data_madrid_y)-91]
y_train print(f'Cantidad datos de entrenamiento: x={len(x_train)} - y={len(y_train)}')
= scaled_data_madrid_x[len(scaled_data_madrid_x)-90:len(scaled_data_madrid_x)]
x_test = scaled_data_madrid_y[len(scaled_data_madrid_y)-90:len(scaled_data_madrid_y)]
y_test print(f'Cantidad datos de test: x={len(x_test)} - y={len(y_test)}')
Cantidad datos de entrenamiento: x=385 - y=385
Cantidad datos de test: x=90 - y=90
# Reshape the data to feed de recurrent network
= np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 8))
x_train print("Train data shape:")
print(x_train.shape)
print(y_train.shape)
= np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 8))
x_test print("Test data shape:")
print(x_test.shape)
print(y_test.shape)
Train data shape:
(385, 90, 8)
(385,)
Test data shape:
(90, 90, 8)
(90,)
# Configure / setup the neural network model - LSTM
# Build the model
print('Build model...')
= Sequential()
model # Model with Neurons
# Inputshape = neurons -> Timestamps
= x_train.shape[1]
neurons90,
model.add(LSTM(= 'relu',
activation = True,
return_sequences = (x_train.shape[1], 8)))
input_shape 50,
model.add(LSTM(= 'relu',
activation = True))
return_sequences 25,
model.add(LSTM(= 'relu',
activation = False))
return_sequences 5, activation = 'relu'))
model.add(Dense(1)) model.add(Dense(
Build model...
compile(optimizer='adam', loss='mean_squared_error')
model. model.summary()
Model: "sequential_7"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
lstm_21 (LSTM) (None, 90, 90) 35640
lstm_22 (LSTM) (None, 90, 50) 28200
lstm_23 (LSTM) (None, 25) 7600
dense_14 (Dense) (None, 5) 130
dense_15 (Dense) (None, 1) 6
=================================================================
Total params: 71,576
Trainable params: 71,576
Non-trainable params: 0
_________________________________________________________________
# Training the model
# fit network
= model.fit(x_train,
history
y_train, =1000,
batch_size=50,
epochs= (x_test, y_test),
validation_data = 0) verbose
'loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.plot(history.history[
plt.legend() plt.show()
# Get the predicted values
= model.predict(x_test)
predictions = scaler_pred.inverse_transform(predictions) predictions
1/3 [=========>....................] - ETA: 0s
3/3 [==============================] - ETA: 0s
3/3 [==============================] - 0s 31ms/step
= y_test.reshape(-1,1)
y_test = scaler_pred.inverse_transform(y_test) y_test
# Calculate the mean absolute error (MAE)
= mean_absolute_error(y_test, predictions)
mae print('MAE: ' + str(round(mae, 1)))
# Calculate the root mean squarred error (RMSE)
= np.sqrt(mean_squared_error(y_test,predictions))
rmse print('RMSE: ' + str(round(rmse, 1)))
# Calculate the root mean squarred error (RMSE)
= mean_squared_error(y_test,
rmse
predictions,= False)
squared print('RMSE: ' + str(round(rmse, 1)))
MAE: 2755.3
RMSE: 5897.2
RMSE: 5897.2
# Add the difference between the valid and predicted prices
= data_madrid[:(len(x_train)+92)]
train = data_madrid[(len(x_train)+91):] valid
1, "Predictions", predictions, True)
valid.insert(1, "Difference", valid["Predictions"] - valid["num_casos"], True) valid.insert(
# Zoom-in to a closer timeframe
# Date from which on the date is displayed
= "2021-07-15"
display_start_date = valid[valid.index > display_start_date]
valid = train[train.index > display_start_date] train
# Visualize the data
'ggplot')
matplotlib.style.use(= plt.subplots(figsize=(22, 10), sharex=True)
fig, ax1
# Data - Train
= train.index;
xt = train[["num_casos"]]
yt # Data - Test / validation
= valid.index;
xv = valid[["num_casos", "Predictions"]]
yv
# Plot
"madrid: Predictions vs Real infections", fontsize=20)
plt.title("Nº Cases", fontsize=18)
plt.ylabel(
="blue", linewidth=1.5)
plt.plot(yt, color"Predictions"], color="red", linewidth=1.5)
plt.plot(yv["num_casos"], color="green", linewidth=1.5)
plt.plot(yv["Train", "LSTM Predictions", "Test"],
plt.legend([="upper left", fontsize=18)
loc
# Bar plot with the differences
= valid.index
x = valid["Difference"]
y =0.2, color="grey")
plt.bar(x, y, width
plt.grid() plt.show()
Málaga
= data_covid.loc[data_covid['provincia'] == 'Málaga']
data_malaga = data_malaga.set_index('fecha')
data_malaga = data_malaga['2020-06-14':'2021-12-31']
data_malaga = data_malaga.filter(['num_casos', 'tmed', 'mob_grocery_pharmacy',
data_malaga 'mob_parks', 'mob_residential', 'mob_residential', 'mob_transit_stations', 'mob_workplaces'])
data_malaga
num_casos | tmed | mob_grocery_pharmacy | mob_parks | mob_residential | mob_residential | mob_transit_stations | mob_workplaces | |
---|---|---|---|---|---|---|---|---|
fecha | ||||||||
2020-06-14 | 2 | 26.6 | -20.0 | -15.0 | 2.0 | 2.0 | -53.0 | -7.0 |
2020-06-15 | 1 | 27.2 | -16.0 | -9.0 | 9.0 | 9.0 | -44.0 | -33.0 |
2020-06-16 | 1 | 27.8 | -13.0 | -4.0 | 9.0 | 9.0 | -40.0 | -33.0 |
2020-06-17 | 2 | 26.9 | -13.0 | 1.0 | 9.0 | 9.0 | -41.0 | -33.0 |
2020-06-18 | 2 | 21.6 | -12.0 | 3.0 | 9.0 | 9.0 | -42.0 | -33.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
2021-12-27 | 1627 | 17.9 | 15.0 | 17.0 | 7.0 | 7.0 | -12.0 | -37.0 |
2021-12-28 | 2772 | 17.8 | 18.0 | 30.0 | 7.0 | 7.0 | -7.0 | -37.0 |
2021-12-29 | 3080 | 17.0 | 25.0 | 37.0 | 6.0 | 6.0 | -8.0 | -37.0 |
2021-12-30 | 3075 | 14.4 | 39.0 | 33.0 | 5.0 | 5.0 | -8.0 | -37.0 |
2021-12-31 | 2646 | 11.1 | 22.0 | -5.0 | 12.0 | 12.0 | -34.0 | -50.0 |
566 rows × 8 columns
data_malaga.describe()
num_casos | tmed | mob_grocery_pharmacy | mob_parks | mob_residential | mob_residential | mob_transit_stations | mob_workplaces | |
---|---|---|---|---|---|---|---|---|
count | 566.000000 | 566.000000 | 566.000000 | 566.000000 | 566.000000 | 566.000000 | 566.000000 | 566.000000 |
mean | 339.227915 | 20.332332 | 8.427562 | 15.765018 | 4.588339 | 4.588339 | -19.860424 | -18.724382 |
std | 427.261346 | 5.690349 | 27.032414 | 41.263233 | 4.227823 | 4.227823 | 19.025924 | 14.733563 |
min | 1.000000 | 6.200000 | -80.000000 | -63.000000 | -4.000000 | -4.000000 | -70.000000 | -79.000000 |
25% | 110.000000 | 15.600000 | -5.000000 | -14.000000 | 2.000000 | 2.000000 | -34.000000 | -27.000000 |
50% | 193.500000 | 20.000000 | 4.000000 | 7.000000 | 4.000000 | 4.000000 | -23.000000 | -18.000000 |
75% | 344.750000 | 25.075000 | 16.000000 | 36.000000 | 7.000000 | 7.000000 | -4.000000 | -12.000000 |
max | 3080.000000 | 34.500000 | 156.000000 | 152.000000 | 22.000000 | 22.000000 | 20.000000 | 29.000000 |
= data_malaga.values np_data_malaga
# Train dataset
= MinMaxScaler(feature_range=(0, 1))
scaler = scaler.fit_transform(np_data_malaga)
scaled_data_malaga print(f'Longitud del conjunto de datos disponible: {len(scaled_data_malaga)}')
Longitud del conjunto de datos disponible: 566
# Since we are going to predict future values based on the 90 past elements,
# we need to create a list with those historic information for each element
= 90
historic_values = []
scaled_data_malaga_x = []
scaled_data_malaga_y
for i in range(historic_values, len(scaled_data_malaga)):
-historic_values):i, :])
scaled_data_malaga_x.append(scaled_data_malaga[(i0])
scaled_data_malaga_y.append(scaled_data_malaga[i,
# Convert the x_train and y_train to numpy arrays
= np.array(scaled_data_malaga_x)
scaled_data_malaga_x = np.array(scaled_data_malaga_y) scaled_data_malaga_y
# Once predicted, we are going to need a exclusive scaler for num_cases
= MinMaxScaler(feature_range=(0, 1))
scaler_pred = pd.DataFrame(data_malaga['num_casos'])
df_cases_malaga = scaler_pred.fit_transform(df_cases_malaga) scaled_data_malaga_pred
# Since the first 90th values does not have historic, the dataset has been reduced in 90 values
print(f'Longitud datos de entrenamiento con historico: {len(scaled_data_malaga_y)}')
Longitud datos de entrenamiento con historico: 476
# we split data in train and test
# as in previous analysis, we are going to predict a maximum of 90 days
= scaled_data_malaga_x[0:len(scaled_data_malaga_x)-91]
x_train = scaled_data_malaga_y[0:len(scaled_data_malaga_y)-91]
y_train print(f'Cantidad datos de entrenamiento: x={len(x_train)} - y={len(y_train)}')
= scaled_data_malaga_x[len(scaled_data_malaga_x)-90:len(scaled_data_malaga_x)]
x_test = scaled_data_malaga_y[len(scaled_data_malaga_y)-90:len(scaled_data_malaga_y)]
y_test print(f'Cantidad datos de test: x={len(x_test)} - y={len(y_test)}')
Cantidad datos de entrenamiento: x=385 - y=385
Cantidad datos de test: x=90 - y=90
# Reshape the data to feed de recurrent network
= np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 8))
x_train print("Train data shape:")
print(x_train.shape)
print(y_train.shape)
= np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 8))
x_test print("Test data shape:")
print(x_test.shape)
print(y_test.shape)
Train data shape:
(385, 90, 8)
(385,)
Test data shape:
(90, 90, 8)
(90,)
# Configure / setup the neural network model - LSTM
# Build the model
print('Build model...')
= Sequential()
model # Model with Neurons
# Inputshape = neurons -> Timestamps
= x_train.shape[1]
neurons90,
model.add(LSTM(= 'relu',
activation = True,
return_sequences = (x_train.shape[1], 8)))
input_shape 50,
model.add(LSTM(= 'relu',
activation = True))
return_sequences 25,
model.add(LSTM(= 'relu',
activation = False))
return_sequences 5, activation = 'relu'))
model.add(Dense(1)) model.add(Dense(
Build model...
compile(optimizer='adam', loss='mean_squared_error')
model. model.summary()
Model: "sequential_8"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
lstm_24 (LSTM) (None, 90, 90) 35640
lstm_25 (LSTM) (None, 90, 50) 28200
lstm_26 (LSTM) (None, 25) 7600
dense_16 (Dense) (None, 5) 130
dense_17 (Dense) (None, 1) 6
=================================================================
Total params: 71,576
Trainable params: 71,576
Non-trainable params: 0
_________________________________________________________________
# Training the model
# fit network
= model.fit(x_train,
history
y_train, =1000,
batch_size=50,
epochs= (x_test, y_test),
validation_data = 0) verbose
'loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.plot(history.history[
plt.legend() plt.show()
# Get the predicted values
= model.predict(x_test)
predictions = scaler_pred.inverse_transform(predictions) predictions
1/3 [=========>....................] - ETA: 0s
3/3 [==============================] - ETA: 0s
3/3 [==============================] - 0s 31ms/step
= y_test.reshape(-1,1)
y_test = scaler_pred.inverse_transform(y_test) y_test
# Calculate the mean absolute error (MAE)
= mean_absolute_error(y_test, predictions)
mae print('MAE: ' + str(round(mae, 1)))
# Calculate the root mean squarred error (RMSE)
= np.sqrt(mean_squared_error(y_test,predictions))
rmse print('RMSE: ' + str(round(rmse, 1)))
# Calculate the root mean squarred error (RMSE)
= mean_squared_error(y_test,
rmse
predictions,= False)
squared print('RMSE: ' + str(round(rmse, 1)))
MAE: 301.5
RMSE: 408.5
RMSE: 408.5
# Add the difference between the valid and predicted prices
= data_malaga[:(len(x_train)+92)]
train = data_malaga[(len(x_train)+91):] valid
1, "Predictions", predictions, True)
valid.insert(1, "Difference", valid["Predictions"] - valid["num_casos"], True) valid.insert(
# Zoom-in to a closer timeframe
# Date from which on the date is displayed
= "2021-07-15"
display_start_date = valid[valid.index > display_start_date]
valid = train[train.index > display_start_date] train
# Visualize the data
'ggplot')
matplotlib.style.use(= plt.subplots(figsize=(22, 10), sharex=True)
fig, ax1
# Data - Train
= train.index;
xt = train[["num_casos"]]
yt # Data - Test / validation
= valid.index;
xv = valid[["num_casos", "Predictions"]]
yv
# Plot
"malaga: Predictions vs Real infections", fontsize=20)
plt.title("Nº Cases", fontsize=18)
plt.ylabel(
="blue", linewidth=1.5)
plt.plot(yt, color"Predictions"], color="red", linewidth=1.5)
plt.plot(yv["num_casos"], color="green", linewidth=1.5)
plt.plot(yv["Train", "LSTM Predictions", "Test"],
plt.legend([="upper left", fontsize=18)
loc
# Bar plot with the differences
= valid.index
x = valid["Difference"]
y =0.2, color="grey")
plt.bar(x, y, width
plt.grid() plt.show()
Sevilla
= data_covid.loc[data_covid['provincia'] == 'Sevilla']
data_sevilla = data_sevilla.set_index('fecha')
data_sevilla = data_sevilla['2020-06-14':'2021-12-31']
data_sevilla = data_sevilla.filter(['num_casos', 'tmed', 'mob_grocery_pharmacy',
data_sevilla 'mob_parks', 'mob_residential', 'mob_residential', 'mob_transit_stations', 'mob_workplaces'])
data_sevilla
num_casos | tmed | mob_grocery_pharmacy | mob_parks | mob_residential | mob_residential | mob_transit_stations | mob_workplaces | |
---|---|---|---|---|---|---|---|---|
fecha | ||||||||
2020-06-14 | 0 | 22.200000 | -24.0 | -36.0 | 0.0 | 0.0 | -51.0 | 0.0 |
2020-06-15 | 2 | 23.088889 | -14.0 | -13.0 | 9.0 | 9.0 | -41.0 | -33.0 |
2020-06-16 | 1 | 23.977778 | -10.0 | -4.0 | 9.0 | 9.0 | -37.0 | -32.0 |
2020-06-17 | 0 | 24.866667 | -10.0 | -7.0 | 9.0 | 9.0 | -37.0 | -32.0 |
2020-06-18 | 0 | 25.755556 | -12.0 | -13.0 | 9.0 | 9.0 | -41.0 | -33.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
2021-12-27 | 2617 | 17.600000 | 16.0 | -1.0 | 7.0 | 7.0 | -19.0 | -39.0 |
2021-12-28 | 3190 | 16.000000 | 17.0 | 15.0 | 7.0 | 7.0 | -16.0 | -37.0 |
2021-12-29 | 3692 | 14.100000 | 24.0 | 18.0 | 6.0 | 6.0 | -18.0 | -38.0 |
2021-12-30 | 3508 | 13.000000 | 39.0 | 11.0 | 6.0 | 6.0 | -21.0 | -39.0 |
2021-12-31 | 2816 | 14.600000 | 15.0 | -24.0 | 12.0 | 12.0 | -44.0 | -53.0 |
566 rows × 8 columns
data_sevilla.describe()
num_casos | tmed | mob_grocery_pharmacy | mob_parks | mob_residential | mob_residential | mob_transit_stations | mob_workplaces | |
---|---|---|---|---|---|---|---|---|
count | 566.000000 | 566.000000 | 566.000000 | 566.000000 | 566.000000 | 566.000000 | 566.000000 | 566.000000 |
mean | 383.807420 | 20.691519 | -1.908127 | -16.176678 | 4.086572 | 4.086572 | -27.310954 | -21.436396 |
std | 454.714408 | 6.882766 | 18.249947 | 18.068323 | 4.554635 | 4.554635 | 14.360299 | 14.353391 |
min | 0.000000 | 5.500000 | -79.000000 | -70.000000 | -7.000000 | -7.000000 | -71.000000 | -79.000000 |
25% | 112.750000 | 15.000000 | -10.000000 | -28.000000 | 1.000000 | 1.000000 | -37.000000 | -30.000000 |
50% | 260.500000 | 20.800000 | -1.000000 | -13.000000 | 4.000000 | 4.000000 | -28.000000 | -18.000000 |
75% | 459.000000 | 26.575000 | 6.000000 | -5.000000 | 7.000000 | 7.000000 | -18.000000 | -11.000000 |
max | 3692.000000 | 34.400000 | 113.000000 | 61.000000 | 22.000000 | 22.000000 | 16.000000 | 9.000000 |
= data_sevilla.values np_data_sevilla
# Train dataset
= MinMaxScaler(feature_range=(0, 1))
scaler = scaler.fit_transform(np_data_sevilla)
scaled_data_sevilla print(f'Longitud del conjunto de datos disponible: {len(scaled_data_sevilla)}')
Longitud del conjunto de datos disponible: 566
# Since we are going to predict future values based on the 90 past elements,
# we need to create a list with those historic information for each element
= 90
historic_values = []
scaled_data_sevilla_x = []
scaled_data_sevilla_y
for i in range(historic_values, len(scaled_data_sevilla)):
-historic_values):i, :])
scaled_data_sevilla_x.append(scaled_data_sevilla[(i0])
scaled_data_sevilla_y.append(scaled_data_sevilla[i,
# Convert the x_train and y_train to numpy arrays
= np.array(scaled_data_sevilla_x)
scaled_data_sevilla_x = np.array(scaled_data_sevilla_y) scaled_data_sevilla_y
# Once predicted, we are going to need a exclusive scaler for num_cases
= MinMaxScaler(feature_range=(0, 1))
scaler_pred = pd.DataFrame(data_sevilla['num_casos'])
df_cases_sevilla = scaler_pred.fit_transform(df_cases_sevilla) scaled_data_sevilla_pred
# Since the first 90th values does not have historic, the dataset has been reduced in 90 values
print(f'Longitud datos de entrenamiento con historico: {len(scaled_data_sevilla_y)}')
Longitud datos de entrenamiento con historico: 476
# we split data in train and test
# as in previous analysis, we are going to predict a maximum of 90 days
= scaled_data_sevilla_x[0:len(scaled_data_sevilla_x)-91]
x_train = scaled_data_sevilla_y[0:len(scaled_data_sevilla_y)-91]
y_train print(f'Cantidad datos de entrenamiento: x={len(x_train)} - y={len(y_train)}')
= scaled_data_sevilla_x[len(scaled_data_sevilla_x)-90:len(scaled_data_sevilla_x)]
x_test = scaled_data_sevilla_y[len(scaled_data_sevilla_y)-90:len(scaled_data_sevilla_y)]
y_test print(f'Cantidad datos de test: x={len(x_test)} - y={len(y_test)}')
Cantidad datos de entrenamiento: x=385 - y=385
Cantidad datos de test: x=90 - y=90
# Reshape the data to feed de recurrent network
= np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 8))
x_train print("Train data shape:")
print(x_train.shape)
print(y_train.shape)
= np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 8))
x_test print("Test data shape:")
print(x_test.shape)
print(y_test.shape)
Train data shape:
(385, 90, 8)
(385,)
Test data shape:
(90, 90, 8)
(90,)
# Configure / setup the neural network model - LSTM
# Build the model
print('Build model...')
= Sequential()
model # Model with Neurons
# Inputshape = neurons -> Timestamps
= x_train.shape[1]
neurons90,
model.add(LSTM(= 'relu',
activation = True,
return_sequences = (x_train.shape[1], 8)))
input_shape 50,
model.add(LSTM(= 'relu',
activation = True))
return_sequences 25,
model.add(LSTM(= 'relu',
activation = False))
return_sequences 5, activation = 'relu'))
model.add(Dense(1)) model.add(Dense(
Build model...
compile(optimizer='adam', loss='mean_squared_error')
model. model.summary()
Model: "sequential_9"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
lstm_27 (LSTM) (None, 90, 90) 35640
lstm_28 (LSTM) (None, 90, 50) 28200
lstm_29 (LSTM) (None, 25) 7600
dense_18 (Dense) (None, 5) 130
dense_19 (Dense) (None, 1) 6
=================================================================
Total params: 71,576
Trainable params: 71,576
Non-trainable params: 0
_________________________________________________________________
# Training the model
# fit network
= model.fit(x_train,
history
y_train, =1000,
batch_size=50,
epochs= (x_test, y_test),
validation_data = 0) verbose
'loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.plot(history.history[
plt.legend() plt.show()
# Get the predicted values
= model.predict(x_test)
predictions = scaler_pred.inverse_transform(predictions) predictions
1/3 [=========>....................] - ETA: 0s
3/3 [==============================] - ETA: 0s
3/3 [==============================] - 0s 31ms/step
= y_test.reshape(-1,1)
y_test = scaler_pred.inverse_transform(y_test) y_test
# Calculate the mean absolute error (MAE)
= mean_absolute_error(y_test, predictions)
mae print('MAE: ' + str(round(mae, 1)))
# Calculate the root mean squarred error (RMSE)
= np.sqrt(mean_squared_error(y_test,predictions))
rmse print('RMSE: ' + str(round(rmse, 1)))
# Calculate the root mean squarred error (RMSE)
= mean_squared_error(y_test,
rmse
predictions,= False)
squared print('RMSE: ' + str(round(rmse, 1)))
MAE: 502.2
RMSE: 535.4
RMSE: 535.4
# Add the difference between the valid and predicted prices
= data_sevilla[:(len(x_train)+92)]
train = data_sevilla[(len(x_train)+91):] valid
1, "Predictions", predictions, True)
valid.insert(1, "Difference", valid["Predictions"] - valid["num_casos"], True) valid.insert(
# Zoom-in to a closer timeframe
# Date from which on the date is displayed
= "2021-07-15"
display_start_date = valid[valid.index > display_start_date]
valid = train[train.index > display_start_date] train
# Visualize the data
'ggplot')
matplotlib.style.use(= plt.subplots(figsize=(22, 10), sharex=True)
fig, ax1
# Data - Train
= train.index;
xt = train[["num_casos"]]
yt # Data - Test / validation
= valid.index;
xv = valid[["num_casos", "Predictions"]]
yv
# Plot
"sevilla: Predictions vs Real infections", fontsize=20)
plt.title("Nº Cases", fontsize=18)
plt.ylabel(
="blue", linewidth=1.5)
plt.plot(yt, color"Predictions"], color="red", linewidth=1.5)
plt.plot(yv["num_casos"], color="green", linewidth=1.5)
plt.plot(yv["Train", "LSTM Predictions", "Test"],
plt.legend([="upper left", fontsize=18)
loc
# Bar plot with the differences
= valid.index
x = valid["Difference"]
y =0.2, color="grey")
plt.bar(x, y, width
plt.grid() plt.show()