LSTM univariate prediction

Import python packages:

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

from keras import Input
from keras.models import Sequential
from keras.layers import Dense, LSTM, Activation, Dropout

data_covid = pd.read_csv('data/clean/final_covid_data.csv')
data_covid
provincia fecha num_casos num_casos_prueba_pcr num_casos_prueba_test_ac num_casos_prueba_ag num_casos_prueba_elisa num_casos_prueba_desconocida num_hosp num_uci ... ws ws_max sol mob_grocery_pharmacy mob_parks mob_residential mob_retail_recreation mob_transit_stations mob_workplaces mob_flujo
0 Barcelona 2020-01-01 0 0 0 0 0 0 0 0 ... 2.5 7.2 4.9 NaN NaN NaN NaN NaN NaN NaN
1 Madrid 2020-01-01 1 1 0 0 0 0 1 0 ... 0.8 3.6 8.3 NaN NaN NaN NaN NaN NaN NaN
2 Málaga 2020-01-01 0 0 0 0 0 0 0 0 ... 3.3 6.7 7.7 NaN NaN NaN NaN NaN NaN NaN
3 Asturias 2020-01-01 0 0 0 0 0 0 0 0 ... 2.5 7.8 7.9 NaN NaN NaN NaN NaN NaN NaN
4 Sevilla 2020-01-01 0 0 0 0 0 0 1 0 ... 1.9 5.8 9.1 NaN NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
4090 Barcelona 2022-03-29 0 0 0 0 0 0 0 0 ... 7.2 13.3 0.0 0.0 -13.0 5.0 -24.0 -16.0 -17.0 NaN
4091 Madrid 2022-03-29 6 3 0 3 0 0 0 0 ... 2.2 6.1 2.4 1.0 -11.0 4.0 -25.0 -16.0 -16.0 NaN
4092 Málaga 2022-03-29 0 0 0 0 0 0 0 0 ... 4.2 10.8 1.5 4.0 -3.0 4.0 -16.0 1.0 -8.0 NaN
4093 Asturias 2022-03-29 0 0 0 0 0 0 1 0 ... 2.2 6.7 0.0 -4.0 17.0 3.0 -25.0 -15.0 -12.0 NaN
4094 Sevilla 2022-03-29 0 0 0 0 0 0 0 0 ... 2.5 8.3 1.6 3.0 -7.0 2.0 -15.0 -13.0 -5.0 NaN

4095 rows × 26 columns

All the available data

We only delete the data from the first wave, since it is not reliable.

Asturias

data_asturias = data_covid.loc[data_covid['provincia'] == 'Asturias']
data_asturias = data_asturias.set_index('fecha')
data_asturias = data_asturias.filter(['num_casos'])
data_asturias = data_asturias['2020-06-14':]
data_asturias
num_casos
fecha
2020-06-14 0
2020-06-15 0
2020-06-16 1
2020-06-17 0
2020-06-18 0
... ...
2022-03-25 244
2022-03-26 432
2022-03-27 1
2022-03-28 9
2022-03-29 0

654 rows × 1 columns

data_asturias.describe()
num_casos
count 654.00000
mean 312.11315
std 565.17985
min 0.00000
25% 43.50000
50% 117.00000
75% 323.75000
max 3827.00000
np_data_asturias = data_asturias.values
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data_asturias = scaler.fit_transform(np_data_asturias)
print(f'Length of the available dataset: {len(scaled_data_asturias)}')
Length of the available dataset: 654
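
MinMaxScaler learns the data minimum and maximum during fit_transform, so the same scaler object can later map the network's outputs back to case counts with inverse_transform. A minimal sketch of the mapping, using the min (0) and max (3827) from the describe() output above:

# scaler.transform maps x -> (x - data_min) / (data_max - data_min)
x = 117.0                                 # median daily cases from describe()
scaled = (x - 0.0) / (3827.0 - 0.0)       # ~0.0306, what scaler.transform produces
restored = scaled * (3827.0 - 0.0) + 0.0  # what scaler.inverse_transform produces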
# Since we are going to predict future values based on the previous 90 elements,
# we need to create a list with that history for each element
historic_values = 90
scaled_data_asturias_x = []
scaled_data_asturias_y = []

for num_casos_i in range(historic_values, len(scaled_data_asturias)):
    scaled_data_asturias_x.append(scaled_data_asturias[(num_casos_i-historic_values):num_casos_i, 0])
    scaled_data_asturias_y.append(scaled_data_asturias[num_casos_i, 0])

# Convert the x and y lists to numpy arrays
scaled_data_asturias_x = np.array(scaled_data_asturias_x)
scaled_data_asturias_y = np.array(scaled_data_asturias_y)
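
Since this windowing step is repeated verbatim for every province below, a small helper could encapsulate it (a sketch; make_windows is a hypothetical name, not part of the original notebook):

def make_windows(series, history):
    # Build (x, y) pairs: each x holds `history` consecutive scaled values,
    # and y is the value that immediately follows the window
    x, y = [], []
    for i in range(history, len(series)):
        x.append(series[i - history:i, 0])
        y.append(series[i, 0])
    return np.array(x), np.array(y)

# Equivalent to the loop above:
# scaled_data_asturias_x, scaled_data_asturias_y = make_windows(scaled_data_asturias, 90)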
# An input window (x) looks like
scaled_data_asturias_x[235]
array([0.07002874, 0.0611445 , 0.07107395, 0.07394826, 0.06689313,
       0.05644108, 0.05278286, 0.03841129, 0.03501437, 0.04285341,
       0.04990854, 0.05016985, 0.04285341, 0.03710478, 0.03057225,
       0.03240136, 0.03945649, 0.03971779, 0.03762738, 0.04311471,
       0.02926574, 0.02691403, 0.03945649, 0.03919519, 0.03396917,
       0.04206951, 0.03684348, 0.03841129, 0.03475307, 0.02795924,
       0.02560753, 0.02743663, 0.03788869, 0.03814999, 0.02769794,
       0.03031095, 0.02822054, 0.03893389, 0.03240136, 0.03553697,
       0.03971779, 0.01802979, 0.01646198, 0.03004965, 0.03109485,
       0.02926574, 0.04468252, 0.03266266, 0.02482362, 0.03031095,
       0.03109485, 0.03893389, 0.03579828, 0.02325581, 0.03841129,
       0.0195976 , 0.02508492, 0.02456232, 0.03057225, 0.03527567,
       0.03919519, 0.03605958, 0.02534622, 0.02926574, 0.03240136,
       0.04807944, 0.0399791 , 0.03396917, 0.02743663, 0.02325581,
       0.02613013, 0.03083355, 0.03788869, 0.02848184, 0.03344656,
       0.03396917, 0.02822054, 0.02247191, 0.02351712, 0.02586883,
       0.02534622, 0.02874314, 0.0198589 , 0.0211654 , 0.01254246,
       0.01907499, 0.01228116, 0.01228116, 0.01881369, 0.01515547])
# The corresponding target value (y) looks like
scaled_data_asturias_y[235]
0.013326365299189964
# Since the first 90 values have no history, the dataset is reduced by 90 samples
print(f'Length of the training data with history: {len(scaled_data_asturias_y)}')
Length of the training data with history: 564
# Split the data into train and test sets;
# as in previous analyses, we are going to predict a maximum of 90 days
x_train = scaled_data_asturias_x[0:len(scaled_data_asturias_x)-91]
y_train = scaled_data_asturias_y[0:len(scaled_data_asturias_y)-91]
print(f'Training data size: x={len(x_train)} - y={len(y_train)}')

x_test = scaled_data_asturias_x[len(scaled_data_asturias_x)-90:len(scaled_data_asturias_x)]
y_test = scaled_data_asturias_y[len(scaled_data_asturias_y)-90:len(scaled_data_asturias_y)]
print(f'Test data size: x={len(x_test)} - y={len(y_test)}')
Training data size: x=473 - y=473
Test data size: x=90 - y=90
# Reshape the data to feed the recurrent network
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
print("Train data shape:")
print(x_train.shape)
print(y_train.shape)
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))
print("Test data shape:")
print(x_test.shape)
print(y_test.shape)
Train data shape:
(473, 90, 1)
(473,)
Test data shape:
(90, 90, 1)
(90,)
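
Keras LSTM layers expect 3-D input of shape (samples, timesteps, features); the reshape simply adds the trailing feature axis for this univariate series. A quick sanity check against the printed shapes:

assert x_train.shape == (473, 90, 1)  # 473 windows, 90 days each, 1 feature
assert x_test.shape == (90, 90, 1)    # 90 windows held out for testing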
# Configure / set up the neural network model - LSTM
# Build the model
print('Build model...')
model = Sequential()
# Input shape = (timesteps, features) = (90, 1)
model.add(LSTM(90, 
               activation = 'relu',
               return_sequences = True, 
               input_shape = (x_train.shape[1], 1))) 
model.add(LSTM(50, 
               activation = 'relu',
               return_sequences = True)) 
model.add(LSTM(25, 
               activation = 'relu',
               return_sequences = False)) 
model.add(Dense(5, activation = 'relu'))
model.add(Dense(1))
Build model...
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 lstm (LSTM)                 (None, 90, 90)            33120     
                                                                 
 lstm_1 (LSTM)               (None, 90, 50)            28200     
                                                                 
 lstm_2 (LSTM)               (None, 25)                7600      
                                                                 
 dense (Dense)               (None, 5)                 130       
                                                                 
 dense_1 (Dense)             (None, 1)                 6         
                                                                 
=================================================================
Total params: 69,056
Trainable params: 69,056
Non-trainable params: 0
_________________________________________________________________
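
The parameter counts in the summary can be checked by hand: a Keras LSTM layer with n units and d input features has 4*(n*(n+d) + n) weights (four gates, each with recurrent weights, input weights and a bias). A quick verification:

def lstm_params(units, input_dim):
    # 4 gates x (recurrent weights + input weights + bias)
    return 4 * (units * (units + input_dim) + units)

print(lstm_params(90, 1))     # 33120 - first LSTM layer
print(lstm_params(50, 90))    # 28200 - second LSTM layer
print(lstm_params(25, 50))    # 7600  - third LSTM layer
print(25 * 5 + 5, 5 * 1 + 1)  # 130 6 - the two Dense layers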
# Training the model
# Fit the network
history = model.fit(x_train, 
                    y_train, 
                    batch_size=1000, 
                    epochs = 30, 
                    validation_data = (x_test, y_test), 
                    verbose = 0)
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show() 

# Get the predicted values
predictions = model.predict(x_test)
predictions = scaler.inverse_transform(predictions)
3/3 [==============================] - 0s 31ms/step
y_test = y_test.reshape(-1,1)
y_test = scaler.inverse_transform(y_test)
# Calculate the mean absolute error (MAE)
mae = mean_absolute_error(y_test, predictions)
print('MAE: ' + str(round(mae, 1)))

# Calculate the root mean squared error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print('RMSE: ' + str(round(rmse, 1)))

# Equivalently, compute the RMSE with squared=False
rmse = mean_squared_error(y_test,
                          predictions,
                          squared=False)
print('RMSE: ' + str(round(rmse, 1)))
MAE: 1074.0
RMSE: 1479.1
RMSE: 1479.1
# Add the difference between the actual and predicted cases
train = data_asturias[:(len(x_train)+92)]
valid = data_asturias[(len(x_train)+91):]
valid.insert(1, "Predictions", predictions, True)
valid.insert(1, "Difference", valid["Predictions"] - valid["num_casos"], True)
# Zoom-in to a closer timeframe
# Date from which the data is displayed
display_start_date = "2021-10-31" 
valid = valid[valid.index > display_start_date]
train = train[train.index > display_start_date]
# Visualize the data
matplotlib.style.use('ggplot')
fig, ax1 = plt.subplots(figsize=(22, 10), sharex=True)

# Data - Train
xt = train.index; 
yt = train[["num_casos"]]
# Data - Test / validation 
xv = valid.index; 
yv = valid[["num_casos", "Predictions"]]

# Plot
plt.title("Asturias: Predictions vs Real infections", fontsize=20)
plt.ylabel("Nº Cases", fontsize=18)

plt.plot(yt, color="blue", linewidth=1.5)
plt.plot(yv["Predictions"], color="red", linewidth=1.5)
plt.plot(yv["num_casos"], color="green", linewidth=1.5)
plt.legend(["Train", "LSTM Predictions", "Test"], 
           loc="upper left", fontsize=18)

# Bar plot with the differences
x = valid.index
y = valid["Difference"]
plt.bar(x, y, width=0.2, color="grey")
plt.grid()
plt.show()
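
The same preprocess-train-evaluate pipeline is repeated below for Barcelona, Madrid, Málaga and Sevilla, changing only the province name. A reusable sketch of the shared preprocessing (a hypothetical train_province helper, shown only to outline the structure; it reuses make_windows from above):

def train_province(df, provincia, start='2020-06-14', history=90, test_days=90):
    serie = df.loc[df['provincia'] == provincia].set_index('fecha').filter(['num_casos'])[start:]
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled = scaler.fit_transform(serie.values)
    x, y = make_windows(scaled, history)
    x_train, y_train = x[:-(test_days + 1)], y[:-(test_days + 1)]
    x_test, y_test = x[-test_days:], y[-test_days:]
    # Add the trailing feature axis expected by the LSTM layers
    x_train = x_train.reshape(x_train.shape[0], history, 1)
    x_test = x_test.reshape(x_test.shape[0], history, 1)
    return scaler, (x_train, y_train), (x_test, y_test)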

Barcelona

data_Barcelona = data_covid.loc[data_covid['provincia'] == 'Barcelona']
data_Barcelona = data_Barcelona.set_index('fecha')
data_Barcelona = data_Barcelona.filter(['num_casos'])
data_Barcelona = data_Barcelona['2020-06-14':]
data_Barcelona
num_casos
fecha
2020-06-14 33
2020-06-15 62
2020-06-16 66
2020-06-17 70
2020-06-18 68
... ...
2022-03-25 598
2022-03-26 345
2022-03-27 252
2022-03-28 688
2022-03-29 0

654 rows × 1 columns

data_Barcelona.describe()
num_casos
count 654.000000
mean 2605.048930
std 4823.001644
min 0.000000
25% 597.250000
50% 1030.500000
75% 2243.750000
max 34701.000000
np_data_Barcelona = data_Barcelona.values
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data_Barcelona = scaler.fit_transform(np_data_Barcelona)
print(f'Length of the available dataset: {len(scaled_data_Barcelona)}')
Length of the available dataset: 654
# Since we are going to predict future values based on the previous 90 elements,
# we need to create a list with that history for each element
historic_values = 90
scaled_data_Barcelona_x = []
scaled_data_Barcelona_y = []

for num_casos_i in range(historic_values, len(scaled_data_Barcelona)):
    scaled_data_Barcelona_x.append(scaled_data_Barcelona[(num_casos_i-historic_values):num_casos_i, 0])
    scaled_data_Barcelona_y.append(scaled_data_Barcelona[num_casos_i, 0])

# Convert the x and y lists to numpy arrays
scaled_data_Barcelona_x = np.array(scaled_data_Barcelona_x)
scaled_data_Barcelona_y = np.array(scaled_data_Barcelona_y)
# An input window (x) looks like
scaled_data_Barcelona_x[235]
array([0.05037319, 0.04726089, 0.02899052, 0.02507132, 0.04639636,
       0.04190081, 0.03749171, 0.03527276, 0.03786634, 0.02619521,
       0.02351517, 0.04077692, 0.03760699, 0.03587793, 0.0364831 ,
       0.03120948, 0.02449497, 0.02083513, 0.03792398, 0.03613729,
       0.03195873, 0.02754964, 0.02818363, 0.02025878, 0.01752111,
       0.03484049, 0.03037376, 0.02870234, 0.0233999 , 0.02610876,
       0.01876027, 0.01645486, 0.0282989 , 0.02625285, 0.02680038,
       0.02564768, 0.02596467, 0.01953834, 0.01746347, 0.03184346,
       0.02973978, 0.02573413, 0.02838535, 0.02919224, 0.02190139,
       0.01890435, 0.03720354, 0.03642546, 0.0325639 , 0.0304314 ,
       0.03556093, 0.02524423, 0.02149794, 0.03803925, 0.03426414,
       0.03048903, 0.03365897, 0.02218956, 0.02530186, 0.02533068,
       0.02832771, 0.03835624, 0.03740526, 0.03512867, 0.03680009,
       0.02855825, 0.02118095, 0.03878851, 0.03507104, 0.03985476,
       0.03155529, 0.03394715, 0.02423561, 0.02169966, 0.03953777,
       0.04596409, 0.0350134 , 0.0395954 , 0.03267917, 0.02772254,
       0.02380335, 0.04668453, 0.04213135, 0.03806807, 0.03671364,
       0.03509985, 0.02106568, 0.01694476, 0.03803925, 0.02815481])
# The corresponding target value (y) looks like
scaled_data_Barcelona_y[235]
0.030661940578081325
# Since the first 90 values have no history, the dataset is reduced by 90 samples
print(f'Length of the training data with history: {len(scaled_data_Barcelona_y)}')
Length of the training data with history: 564
# Split the data into train and test sets;
# as in previous analyses, we are going to predict a maximum of 90 days
x_train = scaled_data_Barcelona_x[0:len(scaled_data_Barcelona_x)-91]
y_train = scaled_data_Barcelona_y[0:len(scaled_data_Barcelona_y)-91]
print(f'Training data size: x={len(x_train)} - y={len(y_train)}')

x_test = scaled_data_Barcelona_x[len(scaled_data_Barcelona_x)-90:len(scaled_data_Barcelona_x)]
y_test = scaled_data_Barcelona_y[len(scaled_data_Barcelona_y)-90:len(scaled_data_Barcelona_y)]
print(f'Test data size: x={len(x_test)} - y={len(y_test)}')
Training data size: x=473 - y=473
Test data size: x=90 - y=90
# Reshape the data to feed the recurrent network
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
print("Train data shape:")
print(x_train.shape)
print(y_train.shape)
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))
print("Test data shape:")
print(x_test.shape)
print(y_test.shape)
Train data shape:
(473, 90, 1)
(473,)
Test data shape:
(90, 90, 1)
(90,)
# Configure / set up the neural network model - LSTM
# Build the model
print('Build model...')
model = Sequential()
# Input shape = (timesteps, features) = (90, 1)
model.add(LSTM(90, 
               activation = 'relu',
               return_sequences = True, 
               input_shape = (x_train.shape[1], 1))) 
model.add(LSTM(50, 
               activation = 'relu',
               return_sequences = True)) 
model.add(LSTM(25, 
               activation = 'relu',
               return_sequences = False)) 
model.add(Dense(5, activation = 'relu'))
model.add(Dense(1))
Build model...
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 lstm_3 (LSTM)               (None, 90, 90)            33120     
                                                                 
 lstm_4 (LSTM)               (None, 90, 50)            28200     
                                                                 
 lstm_5 (LSTM)               (None, 25)                7600      
                                                                 
 dense_2 (Dense)             (None, 5)                 130       
                                                                 
 dense_3 (Dense)             (None, 1)                 6         
                                                                 
=================================================================
Total params: 69,056
Trainable params: 69,056
Non-trainable params: 0
_________________________________________________________________
# Training the model
# fit network
history = model.fit(x_train, 
                    y_train, 
                    batch_size = 1000, 
                    epochs = 30, 
                    validation_data = (x_test, y_test), 
                    verbose = 0)
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show() 

# Get the predicted values
predictions = model.predict(x_test)
predictions = scaler.inverse_transform(predictions)
3/3 [==============================] - 0s 31ms/step
y_test = y_test.reshape(-1,1)
y_test = scaler.inverse_transform(y_test)
# Calculate the mean absolute error (MAE)
mae = mean_absolute_error(y_test, predictions)
print('MAE: ' + str(round(mae, 1)))

# Calculate the root mean squared error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print('RMSE: ' + str(round(rmse, 1)))

# Equivalently, compute the RMSE with squared=False
rmse = mean_squared_error(y_test,
                          predictions,
                          squared=False)
print('RMSE: ' + str(round(rmse, 1)))
MAE: 5228.7
RMSE: 7609.8
RMSE: 7609.8
# Add the difference between the actual and predicted cases
train = data_Barcelona[:(len(x_train)+92)]
valid = data_Barcelona[(len(x_train)+91):]
valid.insert(1, "Predictions", predictions, True)
valid.insert(1, "Difference", valid["Predictions"] - valid["num_casos"], True)
# Zoom-in to a closer timeframe
# Date from which the data is displayed
display_start_date = "2021-10-31" 
valid = valid[valid.index > display_start_date]
train = train[train.index > display_start_date]
# Visualize the data
matplotlib.style.use('ggplot')
fig, ax1 = plt.subplots(figsize=(22, 10), sharex=True)

# Data - Train
xt = train.index; 
yt = train[["num_casos"]]
# Data - Test / validation 
xv = valid.index; 
yv = valid[["num_casos", "Predictions"]]

# Plot
plt.title("Barcelona: Predictions vs Real infections", fontsize=20)
plt.ylabel("Nº Cases", fontsize=18)

plt.plot(yt, color="blue", linewidth=1.5)
plt.plot(yv["Predictions"], color="red", linewidth=1.5)
plt.plot(yv["num_casos"], color="green", linewidth=1.5)
plt.legend(["Train", "LSTM Predictions", "Test"], 
           loc="upper left", fontsize=18)

# Bar plot with the differences
x = valid.index
y = valid["Difference"]
plt.bar(x, y, width=0.2, color="grey")
plt.grid()
plt.show()

Madrid

data_Madrid = data_covid.loc[data_covid['provincia'] == 'Madrid']
data_Madrid = data_Madrid.set_index('fecha')
data_Madrid = data_Madrid.filter(['num_casos'])
data_Madrid = data_Madrid['2020-06-14':]
data_Madrid
num_casos
fecha
2020-06-14 81
2020-06-15 153
2020-06-16 91
2020-06-17 93
2020-06-18 85
... ...
2022-03-25 356
2022-03-26 303
2022-03-27 77
2022-03-28 839
2022-03-29 6

654 rows × 1 columns

data_Madrid.describe()
num_casos
count 654.000000
mean 2392.562691
std 3390.272836
min 6.000000
25% 662.750000
50% 1413.000000
75% 2475.750000
max 23811.000000
np_data_Madrid = data_Madrid.values
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data_Madrid = scaler.fit_transform(np_data_Madrid)
print(f'Length of the available dataset: {len(scaled_data_Madrid)}')
Length of the available dataset: 654
# Since we are going to predict future values based on the previous 90 elements,
# we need to create a list with that history for each element
historic_values = 90
scaled_data_Madrid_x = []
scaled_data_Madrid_y = []

for num_casos_i in range(historic_values, len(scaled_data_Madrid)):
    scaled_data_Madrid_x.append(scaled_data_Madrid[(num_casos_i-historic_values):num_casos_i, 0])
    scaled_data_Madrid_y.append(scaled_data_Madrid[num_casos_i, 0])

# Convert the x and y lists to numpy arrays
scaled_data_Madrid_x = np.array(scaled_data_Madrid_x)
scaled_data_Madrid_y = np.array(scaled_data_Madrid_y)
# An input window (x) looks like
scaled_data_Madrid_x[235]
array([0.11510187, 0.14820416, 0.08880487, 0.06481832, 0.11169922,
       0.11354757, 0.089477  , 0.0694812 , 0.09300567, 0.06133165,
       0.04709095, 0.08447805, 0.07746272, 0.06595253, 0.05708885,
       0.06515438, 0.04835119, 0.04377232, 0.06372611, 0.05931527,
       0.05141777, 0.04541063, 0.05330813, 0.04150389, 0.04032766,
       0.05486242, 0.05225793, 0.04822516, 0.04259609, 0.05246797,
       0.03919345, 0.03860534, 0.05561857, 0.05452636, 0.04814115,
       0.04448645, 0.05288805, 0.04125184, 0.03994959, 0.05448435,
       0.05183785, 0.0531401 , 0.05713085, 0.0431842 , 0.05061962,
       0.0478891 , 0.07292586, 0.07830288, 0.0626339 , 0.05902121,
       0.07485822, 0.05326612, 0.05335014, 0.07111951, 0.07095148,
       0.07784079, 0.05469439, 0.06166772, 0.07149758, 0.07456417,
       0.09880277, 0.09850872, 0.0857803 , 0.08178954, 0.08951901,
       0.07225373, 0.05977736, 0.09750053, 0.09762655, 0.08930897,
       0.08544423, 0.09489603, 0.07187566, 0.06406217, 0.09329973,
       0.09855072, 0.08435203, 0.07653854, 0.08905692, 0.0647343 ,
       0.05486242, 0.08237765, 0.0873766 , 0.07326192, 0.06259189,
       0.07485822, 0.04654484, 0.04494854, 0.04721697, 0.06700273])
# The corresponding target value (y) looks like
scaled_data_Madrid_y[235]
0.0648183154799412
# Since the first 90 values have no history, the dataset is reduced by 90 samples
print(f'Length of the training data with history: {len(scaled_data_Madrid_y)}')
Length of the training data with history: 564
# Split the data into train and test sets;
# as in previous analyses, we are going to predict a maximum of 90 days
x_train = scaled_data_Madrid_x[0:len(scaled_data_Madrid_x)-91]
y_train = scaled_data_Madrid_y[0:len(scaled_data_Madrid_y)-91]
print(f'Training data size: x={len(x_train)} - y={len(y_train)}')

x_test = scaled_data_Madrid_x[len(scaled_data_Madrid_x)-90:len(scaled_data_Madrid_x)]
y_test = scaled_data_Madrid_y[len(scaled_data_Madrid_y)-90:len(scaled_data_Madrid_y)]
print(f'Test data size: x={len(x_test)} - y={len(y_test)}')
Training data size: x=473 - y=473
Test data size: x=90 - y=90
# Reshape the data to feed the recurrent network
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
print("Train data shape:")
print(x_train.shape)
print(y_train.shape)
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))
print("Test data shape:")
print(x_test.shape)
print(y_test.shape)
Train data shape:
(473, 90, 1)
(473,)
Test data shape:
(90, 90, 1)
(90,)
# Configure / set up the neural network model - LSTM
# Build the model
print('Build model...')
model = Sequential()
# Input shape = (timesteps, features) = (90, 1)
model.add(LSTM(90, 
               activation = 'relu',
               return_sequences = True, 
               input_shape = (x_train.shape[1], 1))) 
model.add(LSTM(50, 
               activation = 'relu',
               return_sequences = True)) 
model.add(LSTM(25, 
               activation = 'relu',
               return_sequences = False)) 
model.add(Dense(5, activation = 'relu'))
model.add(Dense(1))
Build model...
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 lstm_6 (LSTM)               (None, 90, 90)            33120     
                                                                 
 lstm_7 (LSTM)               (None, 90, 50)            28200     
                                                                 
 lstm_8 (LSTM)               (None, 25)                7600      
                                                                 
 dense_4 (Dense)             (None, 5)                 130       
                                                                 
 dense_5 (Dense)             (None, 1)                 6         
                                                                 
=================================================================
Total params: 69,056
Trainable params: 69,056
Non-trainable params: 0
_________________________________________________________________
# Training the model
# fit network
history = model.fit(x_train, 
                    y_train, 
                    batch_size = 1000, 
                    epochs = 30, 
                    validation_data = (x_test, y_test), 
                    verbose = 0)
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show() 

# Get the predicted values
predictions = model.predict(x_test)
predictions = scaler.inverse_transform(predictions)
3/3 [==============================] - 0s 30ms/step
y_test = y_test.reshape(-1,1)
y_test = scaler.inverse_transform(y_test)
# Calculate the mean absolute error (MAE)
mae = mean_absolute_error(y_test, predictions)
print('MAE: ' + str(round(mae, 1)))

# Calculate the root mean squared error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print('RMSE: ' + str(round(rmse, 1)))

# Equivalently, compute the RMSE with squared=False
rmse = mean_squared_error(y_test,
                          predictions,
                          squared=False)
print('RMSE: ' + str(round(rmse, 1)))
MAE: 4748.2
RMSE: 6975.8
RMSE: 6975.8
# Add the difference between the actual and predicted cases
train = data_Madrid[:(len(x_train)+92)]
valid = data_Madrid[(len(x_train)+91):]
valid.insert(1, "Predictions", predictions, True)
valid.insert(1, "Difference", valid["Predictions"] - valid["num_casos"], True)
# Zoom-in to a closer timeframe
# Date from which the data is displayed
display_start_date = "2021-10-31" 
valid = valid[valid.index > display_start_date]
train = train[train.index > display_start_date]
# Visualize the data
matplotlib.style.use('ggplot')
fig, ax1 = plt.subplots(figsize=(22, 10), sharex=True)

# Data - Train
xt = train.index; 
yt = train[["num_casos"]]
# Data - Test / validation 
xv = valid.index; 
yv = valid[["num_casos", "Predictions"]]

# Plot
plt.title("Madrid: Predictions vs Real infections", fontsize=20)
plt.ylabel("Nº Cases", fontsize=18)

plt.plot(yt, color="blue", linewidth=1.5)
plt.plot(yv["Predictions"], color="red", linewidth=1.5)
plt.plot(yv["num_casos"], color="green", linewidth=1.5)
plt.legend(["Train", "LSTM Predictions", "Test"], 
           loc="upper left", fontsize=18)

# Bar plot with the differences
x = valid.index
y = valid["Difference"]
plt.bar(x, y, width=0.2, color="grey")
plt.grid()
plt.show()

Málaga

data_Malaga = data_covid.loc[data_covid['provincia'] == 'Málaga']
data_Malaga = data_Malaga.set_index('fecha')
data_Malaga = data_Malaga.filter(['num_casos'])
data_Malaga = data_Malaga['2020-06-14':]
data_Malaga
num_casos
fecha
2020-06-14 2
2020-06-15 1
2020-06-16 1
2020-06-17 2
2020-06-18 2
... ...
2022-03-25 565
2022-03-26 79
2022-03-27 65
2022-03-28 39
2022-03-29 0

654 rows × 1 columns

data_Malaga.describe()
num_casos
count 654.000000
mean 410.206422
std 489.452348
min 0.000000
25% 122.250000
50% 215.500000
75% 475.250000
max 3080.000000
np_data_Malaga = data_Malaga.values
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data_Malaga = scaler.fit_transform(np_data_Malaga)
print(f'Length of the available dataset: {len(scaled_data_Malaga)}')
Length of the available dataset: 654
# Since we are going to predict future values based on the previous 90 elements,
# we need to create a list with that history for each element
historic_values = 90
scaled_data_Malaga_x = []
scaled_data_Malaga_y = []

for num_casos_i in range(historic_values, len(scaled_data_Malaga)):
    scaled_data_Malaga_x.append(scaled_data_Malaga[(num_casos_i-historic_values):num_casos_i, 0])
    scaled_data_Malaga_y.append(scaled_data_Malaga[num_casos_i, 0])

# Convert the x and y lists to numpy arrays
scaled_data_Malaga_x = np.array(scaled_data_Malaga_x)
scaled_data_Malaga_y = np.array(scaled_data_Malaga_y)
# An input window (x) looks like
scaled_data_Malaga_x[235]
array([0.24448052, 0.21915584, 0.15649351, 0.12045455, 0.15357143,
       0.14448052, 0.1474026 , 0.14675325, 0.11655844, 0.09448052,
       0.06753247, 0.09545455, 0.11071429, 0.06883117, 0.06980519,
       0.06688312, 0.04902597, 0.03474026, 0.05974026, 0.04935065,
       0.04967532, 0.04480519, 0.04188312, 0.03993506, 0.02954545,
       0.04188312, 0.04772727, 0.05162338, 0.04902597, 0.04318182,
       0.02792208, 0.0288961 , 0.04253247, 0.04123377, 0.04025974,
       0.03538961, 0.05097403, 0.03051948, 0.02662338, 0.04123377,
       0.03149351, 0.04350649, 0.03733766, 0.03571429, 0.02954545,
       0.0211039 , 0.04188312, 0.04545455, 0.03863636, 0.04772727,
       0.04935065, 0.03571429, 0.0288961 , 0.05357143, 0.05324675,
       0.05616883, 0.04253247, 0.04058442, 0.03474026, 0.04935065,
       0.06623377, 0.07824675, 0.06623377, 0.05909091, 0.06233766,
       0.05064935, 0.03116883, 0.07077922, 0.05649351, 0.07792208,
       0.06168831, 0.0711039 , 0.04155844, 0.04577922, 0.06623377,
       0.06720779, 0.05909091, 0.05519481, 0.05324675, 0.04383117,
       0.03668831, 0.05941558, 0.06525974, 0.06006494, 0.05551948,
       0.05876623, 0.04512987, 0.04545455, 0.06266234, 0.07727273])
# The corresponding target value (y) looks like
scaled_data_Malaga_y[235]
0.0737012987012987
# Since the first 90 values have no history, the dataset is reduced by 90 samples
print(f'Length of the training data with history: {len(scaled_data_Malaga_y)}')
Length of the training data with history: 564
# Split the data into train and test sets;
# as in previous analyses, we are going to predict a maximum of 90 days
x_train = scaled_data_Malaga_x[0:len(scaled_data_Malaga_x)-91]
y_train = scaled_data_Malaga_y[0:len(scaled_data_Malaga_y)-91]
print(f'Training data size: x={len(x_train)} - y={len(y_train)}')

x_test = scaled_data_Malaga_x[len(scaled_data_Malaga_x)-90:len(scaled_data_Malaga_x)]
y_test = scaled_data_Malaga_y[len(scaled_data_Malaga_y)-90:len(scaled_data_Malaga_y)]
print(f'Test data size: x={len(x_test)} - y={len(y_test)}')
Training data size: x=473 - y=473
Test data size: x=90 - y=90
# Reshape the data to feed the recurrent network
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
print("Train data shape:")
print(x_train.shape)
print(y_train.shape)
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))
print("Test data shape:")
print(x_test.shape)
print(y_test.shape)
Train data shape:
(473, 90, 1)
(473,)
Test data shape:
(90, 90, 1)
(90,)
# Configure / set up the neural network model - LSTM
# Build the model
print('Build model...')
model = Sequential()
# Input shape = (timesteps, features) = (90, 1)
model.add(LSTM(90, 
               activation = 'relu',
               return_sequences = True, 
               input_shape = (x_train.shape[1], 1))) 
model.add(LSTM(50, 
               activation = 'relu',
               return_sequences = True)) 
model.add(LSTM(25, 
               activation = 'relu',
               return_sequences = False)) 
model.add(Dense(5, activation = 'relu'))
model.add(Dense(1))
Build model...
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()
Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 lstm_9 (LSTM)               (None, 90, 90)            33120     
                                                                 
 lstm_10 (LSTM)              (None, 90, 50)            28200     
                                                                 
 lstm_11 (LSTM)              (None, 25)                7600      
                                                                 
 dense_6 (Dense)             (None, 5)                 130       
                                                                 
 dense_7 (Dense)             (None, 1)                 6         
                                                                 
=================================================================
Total params: 69,056
Trainable params: 69,056
Non-trainable params: 0
_________________________________________________________________
# Training the model
# fit network
history = model.fit(x_train, 
                    y_train, 
                    batch_size = 1000, 
                    epochs = 30, 
                    validation_data = (x_test, y_test), 
                    verbose = 0)
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show() 

# Get the predicted values
predictions = model.predict(x_test)
predictions = scaler.inverse_transform(predictions)
3/3 [==============================] - 0s 31ms/step
y_test = y_test.reshape(-1,1)
y_test = scaler.inverse_transform(y_test)
# Calculate the mean absolute error (MAE)
mae = mean_absolute_error(y_test, predictions)
print('MAE: ' + str(round(mae, 1)))

# Calculate the root mean squared error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print('RMSE: ' + str(round(rmse, 1)))

# Equivalently, compute the RMSE with squared=False
rmse = mean_squared_error(y_test,
                          predictions,
                          squared=False)
print('RMSE: ' + str(round(rmse, 1)))
MAE: 353.5
RMSE: 481.1
RMSE: 481.1
# Add the difference between the actual and predicted cases
train = data_Malaga[:(len(x_train)+92)]
valid = data_Malaga[(len(x_train)+91):]
valid.insert(1, "Predictions", predictions, True)
valid.insert(1, "Difference", valid["Predictions"] - valid["num_casos"], True)
# Zoom-in to a closer timeframe
# Date from which the data is displayed
display_start_date = "2021-10-31" 
valid = valid[valid.index > display_start_date]
train = train[train.index > display_start_date]
# Visualize the data
matplotlib.style.use('ggplot')
fig, ax1 = plt.subplots(figsize=(22, 10), sharex=True)

# Data - Train
xt = train.index; 
yt = train[["num_casos"]]
# Data - Test / validation 
xv = valid.index; 
yv = valid[["num_casos", "Predictions"]]

# Plot
plt.title("Malaga: Predictions vs Real infections", fontsize=20)
plt.ylabel("Nº Cases", fontsize=18)

plt.plot(yt, color="blue", linewidth=1.5)
plt.plot(yv["Predictions"], color="red", linewidth=1.5)
plt.plot(yv["num_casos"], color="green", linewidth=1.5)
plt.legend(["Train", "LSTM Predictions", "Test"], 
           loc="upper left", fontsize=18)

# Bar plot with the differences
x = valid.index
y = valid["Difference"]
plt.bar(x, y, width=0.2, color="grey")
plt.grid()
plt.show()

Sevilla

data_Sevilla = data_covid.loc[data_covid['provincia'] == 'Sevilla']
data_Sevilla = data_Sevilla.set_index('fecha')
data_Sevilla = data_Sevilla.filter(['num_casos'])
data_Sevilla = data_Sevilla['2020-06-14':]
data_Sevilla
num_casos
fecha
2020-06-14 0
2020-06-15 2
2020-06-16 1
2020-06-17 0
2020-06-18 0
... ...
2022-03-25 365
2022-03-26 67
2022-03-27 24
2022-03-28 12
2022-03-29 0

654 rows × 1 columns

data_Sevilla.describe()
num_casos
count 654.000000
mean 432.978593
std 500.618517
min 0.000000
25% 127.250000
50% 282.000000
75% 528.250000
max 3692.000000
np_data_Sevilla = data_Sevilla.values
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data_Sevilla = scaler.fit_transform(np_data_Sevilla)
print(f'Length of the available dataset: {len(scaled_data_Sevilla)}')
Length of the available dataset: 654
# Since we are going to predict future values based on the previous 90 elements,
# we need to create a list with that history for each element
historic_values = 90
scaled_data_Sevilla_x = []
scaled_data_Sevilla_y = []

for num_casos_i in range(historic_values, len(scaled_data_Sevilla)):
    scaled_data_Sevilla_x.append(scaled_data_Sevilla[(num_casos_i-historic_values):num_casos_i, 0])
    scaled_data_Sevilla_y.append(scaled_data_Sevilla[num_casos_i, 0])

# Convert the x and y lists to numpy arrays
scaled_data_Sevilla_x = np.array(scaled_data_Sevilla_x)
scaled_data_Sevilla_y = np.array(scaled_data_Sevilla_y)
# An input window (x) looks like
scaled_data_Sevilla_x[235]
array([0.18634886, 0.19799567, 0.14626219, 0.0896533 , 0.13894908,
       0.1695558 , 0.14572048, 0.12242687, 0.13434453, 0.06500542,
       0.05390033, 0.08206934, 0.08667389, 0.09452871, 0.06798483,
       0.05904659, 0.04252438, 0.0343987 , 0.06554713, 0.06473456,
       0.05444204, 0.0503792 , 0.05796316, 0.04577465, 0.03819068,
       0.04712893, 0.0528169 , 0.0663597 , 0.06013001, 0.05119177,
       0.03764897, 0.031961  , 0.05119177, 0.05010834, 0.04739978,
       0.05119177, 0.05390033, 0.03927411, 0.03737811, 0.06013001,
       0.05633803, 0.05606717, 0.05850488, 0.06473456, 0.04658722,
       0.04198267, 0.07150596, 0.07340195, 0.05823402, 0.08071506,
       0.07773564, 0.05877573, 0.05471289, 0.07990249, 0.08829902,
       0.0872156 , 0.09019502, 0.07069339, 0.08775731, 0.08396533,
       0.14951246, 0.12432286, 0.12269772, 0.12378115, 0.13028169,
       0.11348862, 0.07936078, 0.13299025, 0.12269772, 0.11213434,
       0.12107259, 0.12432286, 0.09723727, 0.07042254, 0.09886241,
       0.11782232, 0.10157096, 0.10346696, 0.12621885, 0.08342362,
       0.07340195, 0.0847779 , 0.08559047, 0.11159263, 0.10292524,
       0.08911159, 0.07502709, 0.04821235, 0.08315276, 0.09913326])
# The corresponding target value (y) looks like
scaled_data_Sevilla_y[235]
0.10861321776814734
# Since the first 90 values have no history, the dataset is reduced by 90 samples
print(f'Length of the training data with history: {len(scaled_data_Sevilla_y)}')
Length of the training data with history: 564
# Split the data into train and test sets;
# as in previous analyses, we are going to predict a maximum of 90 days
x_train = scaled_data_Sevilla_x[0:len(scaled_data_Sevilla_x)-91]
y_train = scaled_data_Sevilla_y[0:len(scaled_data_Sevilla_y)-91]
print(f'Training data size: x={len(x_train)} - y={len(y_train)}')

x_test = scaled_data_Sevilla_x[len(scaled_data_Sevilla_x)-90:len(scaled_data_Sevilla_x)]
y_test = scaled_data_Sevilla_y[len(scaled_data_Sevilla_y)-90:len(scaled_data_Sevilla_y)]
print(f'Test data size: x={len(x_test)} - y={len(y_test)}')
Training data size: x=473 - y=473
Test data size: x=90 - y=90
# Reshape the data to feed the recurrent network
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
print("Train data shape:")
print(x_train.shape)
print(y_train.shape)
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))
print("Test data shape:")
print(x_test.shape)
print(y_test.shape)
Train data shape:
(473, 90, 1)
(473,)
Test data shape:
(90, 90, 1)
(90,)
# Configure / set up the neural network model - LSTM
# Build the model
print('Build model...')
model = Sequential()
# Input shape = (timesteps, features) = (90, 1)
model.add(LSTM(90, 
               activation = 'relu',
               return_sequences = True, 
               input_shape = (x_train.shape[1], 1))) 
model.add(LSTM(50, 
               activation = 'relu',
               return_sequences = True)) 
model.add(LSTM(25, 
               activation = 'relu',
               return_sequences = False)) 
model.add(Dense(5, activation = 'relu'))
model.add(Dense(1))
Build model...
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()
Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 lstm_12 (LSTM)              (None, 90, 90)            33120     
                                                                 
 lstm_13 (LSTM)              (None, 90, 50)            28200     
                                                                 
 lstm_14 (LSTM)              (None, 25)                7600      
                                                                 
 dense_8 (Dense)             (None, 5)                 130       
                                                                 
 dense_9 (Dense)             (None, 1)                 6         
                                                                 
=================================================================
Total params: 69,056
Trainable params: 69,056
Non-trainable params: 0
_________________________________________________________________
# Training the model
# fit network
history = model.fit(x_train, 
                    y_train, 
                    batch_size = 1000, 
                    epochs = 30, 
                    validation_data = (x_test, y_test), 
                    verbose = 0)
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show() 

# Get the predicted values
predictions = model.predict(x_test)
predictions = scaler.inverse_transform(predictions)
3/3 [==============================] - 0s 32ms/step
y_test = y_test.reshape(-1,1)
y_test = scaler.inverse_transform(y_test)
# Calculate the mean absolute error (MAE)
mae = mean_absolute_error(y_test, predictions)
print('MAE: ' + str(round(mae, 1)))

# Calculate the root mean squared error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print('RMSE: ' + str(round(rmse, 1)))

# Equivalently, compute the RMSE with squared=False
rmse = mean_squared_error(y_test,
                          predictions,
                          squared=False)
print('RMSE: ' + str(round(rmse, 1)))
MAE: 335.1
RMSE: 474.0
RMSE: 474.0
# Add the difference between the actual and predicted cases
train = data_Sevilla[:(len(x_train)+92)]
valid = data_Sevilla[(len(x_train)+91):]
valid.insert(1, "Predictions", predictions, True)
valid.insert(1, "Difference", valid["Predictions"] - valid["num_casos"], True)
# Zoom-in to a closer timeframe
# Date from which the data is displayed
display_start_date = "2021-10-31" 
valid = valid[valid.index > display_start_date]
train = train[train.index > display_start_date]
# Visualize the data
matplotlib.style.use('ggplot')
fig, ax1 = plt.subplots(figsize=(22, 10), sharex=True)

# Data - Train
xt = train.index; 
yt = train[["num_casos"]]
# Data - Test / validation 
xv = valid.index; 
yv = valid[["num_casos", "Predictions"]]

# Plot
plt.title("Sevilla: Predictions vs Real infections", fontsize=20)
plt.ylabel("Nº Cases", fontsize=18)

plt.plot(yt, color="blue", linewidth=1.5)
plt.plot(yv["Predictions"], color="red", linewidth=1.5)
plt.plot(yv["num_casos"], color="green", linewidth=1.5)
plt.legend(["Train", "LSTM Predictions", "Test"], 
           loc="upper left", fontsize=18)

# Bar plot with the differences
x = valid.index
y = valid["Difference"]
plt.bar(x, y, width=0.2, color="grey")
plt.grid()
plt.show()

Just before the sixth wave

Asturias

data_asturias = data_covid.loc[data_covid['provincia'] == 'Asturias']
data_asturias = data_asturias.set_index('fecha')
data_asturias = data_asturias.filter(['num_casos'])
data_asturias = data_asturias['2020-06-14':'2021-12-31']
data_asturias
num_casos
fecha
2020-06-14 0
2020-06-15 0
2020-06-16 1
2020-06-17 0
2020-06-18 0
... ...
2021-12-27 2363
2021-12-28 2150
2021-12-29 2159
2021-12-30 2020
2021-12-31 1949

566 rows × 1 columns

data_asturias.describe()
num_casos
count 566.000000
mean 180.913428
std 276.250633
min 0.000000
25% 36.000000
50% 94.500000
75% 225.750000
max 2363.000000
np_data_asturias = data_asturias.values
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data_asturias = scaler.fit_transform(np_data_asturias)
print(f'Length of the available dataset: {len(scaled_data_asturias)}')
Length of the available dataset: 566
# Since we are going to predict future values based on the previous 90 elements,
# we need to create a list with that history for each element
historic_values = 90
scaled_data_asturias_x = []
scaled_data_asturias_y = []

for num_casos_i in range(historic_values, len(scaled_data_asturias)):
    scaled_data_asturias_x.append(scaled_data_asturias[(num_casos_i-historic_values):num_casos_i, 0])
    scaled_data_asturias_y.append(scaled_data_asturias[num_casos_i, 0])

# Convert the x and y lists to numpy arrays
scaled_data_asturias_x = np.array(scaled_data_asturias_x)
scaled_data_asturias_y = np.array(scaled_data_asturias_y)
# Since the first 90 values have no history, the dataset is reduced by 90 samples
print(f'Length of the training data with history: {len(scaled_data_asturias_y)}')
Length of the training data with history: 476
# Split the data into train and test sets;
# as in previous analyses, we are going to predict a maximum of 90 days
x_train = scaled_data_asturias_x[0:len(scaled_data_asturias_x)-91]
y_train = scaled_data_asturias_y[0:len(scaled_data_asturias_y)-91]
print(f'Training data size: x={len(x_train)} - y={len(y_train)}')

x_test = scaled_data_asturias_x[len(scaled_data_asturias_x)-90:len(scaled_data_asturias_x)]
y_test = scaled_data_asturias_y[len(scaled_data_asturias_y)-90:len(scaled_data_asturias_y)]
print(f'Test data size: x={len(x_test)} - y={len(y_test)}')
Training data size: x=385 - y=385
Test data size: x=90 - y=90
# Reshape the data to feed the recurrent network
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
print("Train data shape:")
print(x_train.shape)
print(y_train.shape)
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))
print("Test data shape:")
print(x_test.shape)
print(y_test.shape)
Train data shape:
(385, 90, 1)
(385,)
Test data shape:
(90, 90, 1)
(90,)
# Configure / set up the neural network model - LSTM
# Build the model
print('Build model...')
model = Sequential()
# Input shape = (timesteps, features) = (90, 1)
model.add(LSTM(90, 
               activation = 'relu',
               return_sequences = True, 
               input_shape = (x_train.shape[1], 1))) 
model.add(LSTM(50, 
               activation = 'relu',
               return_sequences = True)) 
model.add(LSTM(25, 
               activation = 'relu',
               return_sequences = False)) 
model.add(Dense(5, activation = 'relu'))
model.add(Dense(1))
Build model...
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()
Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 lstm_15 (LSTM)              (None, 90, 90)            33120     
                                                                 
 lstm_16 (LSTM)              (None, 90, 50)            28200     
                                                                 
 lstm_17 (LSTM)              (None, 25)                7600      
                                                                 
 dense_10 (Dense)            (None, 5)                 130       
                                                                 
 dense_11 (Dense)            (None, 1)                 6         
                                                                 
=================================================================
Total params: 69,056
Trainable params: 69,056
Non-trainable params: 0
_________________________________________________________________
# Training the model
# Fit the network
history = model.fit(x_train, 
                    y_train, 
                    batch_size=1000, 
                    epochs=50, 
                    validation_data = (x_test, y_test), 
                    verbose = 0)
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show() 

# Get the predicted values
predictions = model.predict(x_test)
predictions = scaler.inverse_transform(predictions)
3/3 [==============================] - 0s 31ms/step
y_test = y_test.reshape(-1,1)
y_test = scaler.inverse_transform(y_test)
# Calculate the mean absolute error (MAE)
mae = mean_absolute_error(y_test, predictions)
print('MAE: ' + str(round(mae, 1)))

# Calculate the root mean squared error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print('RMSE: ' + str(round(rmse, 1)))

# Equivalently, compute the RMSE with squared=False
rmse = mean_squared_error(y_test,
                          predictions,
                          squared=False)
print('RMSE: ' + str(round(rmse, 1)))
MAE: 208.7
RMSE: 387.0
RMSE: 387.0
# Add the difference between the actual and predicted cases
train = data_asturias[:(len(x_train)+92)]
valid = data_asturias[(len(x_train)+91):]
valid.insert(1, "Predictions", predictions, True)
valid.insert(1, "Difference", valid["Predictions"] - valid["num_casos"], True)
# Zoom-in to a closer timeframe
# Date from which the data is displayed
display_start_date = "2021-07-15"
valid = valid[valid.index > display_start_date]
train = train[train.index > display_start_date]
# Visualize the data
matplotlib.style.use('ggplot')
fig, ax1 = plt.subplots(figsize=(22, 10), sharex=True)

# Data - Train
xt = train.index; 
yt = train[["num_casos"]]
# Data - Test / validation 
xv = valid.index; 
yv = valid[["num_casos", "Predictions"]]

# Plot
plt.title("Asturias: Predictions vs Real infections", fontsize=20)
plt.ylabel("Nº Cases", fontsize=18)

plt.plot(yt, color="blue", linewidth=1.5)
plt.plot(yv["Predictions"], color="red", linewidth=1.5)
plt.plot(yv["num_casos"], color="green", linewidth=1.5)
plt.legend(["Train", "LSTM Predictions", "Test"], 
           loc="upper left", fontsize=18)

# Bar plot with the differences
x = valid.index
y = valid["Difference"]
plt.bar(x, y, width=0.2, color="grey")
plt.grid()
plt.show()

Barcelona

data_Barcelona = data_covid.loc[data_covid['provincia'] == 'Barcelona']
data_Barcelona = data_Barcelona.set_index('fecha')
data_Barcelona = data_Barcelona.filter(['num_casos'])
data_Barcelona = data_Barcelona['2020-06-14':'2021-12-31']
data_Barcelona
num_casos
fecha
2020-06-14 33
2020-06-15 62
2020-06-16 66
2020-06-17 70
2020-06-18 68
... ...
2021-12-27 19383
2021-12-28 20192
2021-12-29 19361
2021-12-30 17639
2021-12-31 16651

566 rows × 1 columns

data_Barcelona.describe()
num_casos
count 566.000000
mean 1575.480565
std 2281.290383
min 33.000000
25% 544.750000
50% 944.500000
75% 1609.500000
max 20192.000000
np_data_Barcelona = data_Barcelona.values
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data_Barcelona = scaler.fit_transform(np_data_Barcelona)
print(f'Length of the available dataset: {len(scaled_data_Barcelona)}')
Length of the available dataset: 566
# Since we are going to predict future values based on the previous 90 elements,
# we need to create a list with that history for each element
historic_values = 90
scaled_data_Barcelona_x = []
scaled_data_Barcelona_y = []

for num_casos_i in range(historic_values, len(scaled_data_Barcelona)):
    scaled_data_Barcelona_x.append(scaled_data_Barcelona[(num_casos_i-historic_values):num_casos_i, 0])
    scaled_data_Barcelona_y.append(scaled_data_Barcelona[num_casos_i, 0])

# Convert the x and y lists to numpy arrays
scaled_data_Barcelona_x = np.array(scaled_data_Barcelona_x)
scaled_data_Barcelona_y = np.array(scaled_data_Barcelona_y)
# An input window (x) looks like
scaled_data_Barcelona_x[235]
array([0.08507366, 0.07971626, 0.04826628, 0.04151992, 0.07822809,
       0.07048961, 0.06289995, 0.05908031, 0.06354482, 0.04345454,
       0.03884121, 0.06855499, 0.06309837, 0.06012203, 0.06116375,
       0.05208592, 0.0405278 , 0.03422789, 0.06364403, 0.06056848,
       0.05337566, 0.045786  , 0.04687733, 0.03323578, 0.02852324,
       0.05833623, 0.05064735, 0.04777023, 0.03864279, 0.04330572,
       0.03065628, 0.02668783, 0.04707575, 0.04355375, 0.04449625,
       0.04251203, 0.04305769, 0.03199563, 0.02842403, 0.05317724,
       0.04955603, 0.04266085, 0.04722456, 0.04861352, 0.0360633 ,
       0.03090431, 0.06240389, 0.06106454, 0.05441738, 0.05074656,
       0.05957637, 0.04181755, 0.03536882, 0.06384245, 0.05734411,
       0.05084578, 0.0563024 , 0.03655935, 0.04191676, 0.04196637,
       0.04712535, 0.06438811, 0.06275113, 0.05883228, 0.06170941,
       0.0475222 , 0.03482316, 0.0651322 , 0.05873307, 0.06696761,
       0.05268118, 0.05679845, 0.04008135, 0.03571606, 0.06642195,
       0.077484  , 0.05863386, 0.06652116, 0.0546158 , 0.04608364,
       0.03933727, 0.07872414, 0.07088645, 0.06389206, 0.06156059,
       0.05878268, 0.03462473, 0.02753113, 0.06384245, 0.04682772])
# The corresponding target value (y) looks like
scaled_data_Barcelona_y[235]
0.05114340989136366
# Since the first 90 values have no history, the dataset is reduced by 90 samples
print(f'Length of the training data with history: {len(scaled_data_Barcelona_y)}')
Length of the training data with history: 476
# Split the data into train and test sets;
# as in previous analyses, we are going to predict a maximum of 90 days
x_train = scaled_data_Barcelona_x[0:len(scaled_data_Barcelona_x)-91]
y_train = scaled_data_Barcelona_y[0:len(scaled_data_Barcelona_y)-91]
print(f'Training data size: x={len(x_train)} - y={len(y_train)}')

x_test = scaled_data_Barcelona_x[len(scaled_data_Barcelona_x)-90:len(scaled_data_Barcelona_x)]
y_test = scaled_data_Barcelona_y[len(scaled_data_Barcelona_y)-90:len(scaled_data_Barcelona_y)]
print(f'Test data size: x={len(x_test)} - y={len(y_test)}')
Training data size: x=385 - y=385
Test data size: x=90 - y=90
# Reshape the data to feed the recurrent network
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
print("Train data shape:")
print(x_train.shape)
print(y_train.shape)
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))
print("Test data shape:")
print(x_test.shape)
print(y_test.shape)
Train data shape:
(385, 90, 1)
(385,)
Test data shape:
(90, 90, 1)
(90,)
# Configure / set up the neural network model - LSTM
# Build the model
print('Build model...')
model = Sequential()
# Input shape = (timesteps, features) = (90, 1)
model.add(LSTM(90, 
               activation = 'relu',
               return_sequences = True, 
               input_shape = (x_train.shape[1], 1))) 
model.add(LSTM(50, 
               activation = 'relu',
               return_sequences = True)) 
model.add(LSTM(25, 
               activation = 'relu',
               return_sequences = False)) 
model.add(Dense(5, activation = 'relu'))
model.add(Dense(1))
Build model...
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()
Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 lstm_18 (LSTM)              (None, 90, 90)            33120     
                                                                 
 lstm_19 (LSTM)              (None, 90, 50)            28200     
                                                                 
 lstm_20 (LSTM)              (None, 25)                7600      
                                                                 
 dense_12 (Dense)            (None, 5)                 130       
                                                                 
 dense_13 (Dense)            (None, 1)                 6         
                                                                 
=================================================================
Total params: 69,056
Trainable params: 69,056
Non-trainable params: 0
_________________________________________________________________
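The parameter counts in the summary can be checked against the standard LSTM formula, params = 4 × units × (units + input_dim + 1), one weight set per gate (a side check, not part of the original notebook):

# Verify the summary's parameter counts
def lstm_params(units, input_dim):
    return 4 * units * (units + input_dim + 1)

print(lstm_params(90, 1))      # 33120 - first LSTM, 1 feature in
print(lstm_params(50, 90))     # 28200 - second LSTM, 90-dim input
print(lstm_params(25, 50))     # 7600  - third LSTM, 50-dim input
print(25 * 5 + 5, 5 * 1 + 1)   # 130, 6 - the two Dense layers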
# Train the model (fit the network)
history = model.fit(x_train, 
                    y_train, 
                    batch_size = 1000, 
                    epochs = 50, 
                    validation_data = (x_test, y_test), 
                    verbose = 0)
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show() 
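With only 385 training windows, batch_size = 1000 means each epoch performs a single gradient step, so this run amounts to 50 updates in total. A smaller batch together with early stopping on the validation loss would be one common alternative (a hypothetical variant, not what was run here):

# Hypothetical training variant with early stopping (not executed above)
from keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=10,
                           restore_best_weights=True)
# model.fit(x_train, y_train, batch_size=32, epochs=200,
#           validation_data=(x_test, y_test),
#           callbacks=[early_stop], verbose=0)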

# Get the predicted values
predictions = model.predict(x_test)
predictions = scaler.inverse_transform(predictions)
3/3 [==============================] - 0s 31ms/step
y_test = y_test.reshape(-1,1)
y_test = scaler.inverse_transform(y_test)
# Calculate the mean absolute error (MAE)
mae = mean_absolute_error(y_test, predictions)
print('MAE: ' + str(round(mae, 1)))

# Calculate the root mean squared error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test,predictions))
print('RMSE: ' + str(round(rmse, 1)))

# Alternative: RMSE straight from mean_squared_error with squared=False (same value)
rmse = mean_squared_error(y_test, 
                          predictions,
                          squared = False)
print('RMSE: ' + str(round(rmse, 1)))
MAE: 1674.3
RMSE: 3525.8
RMSE: 3525.8
# Add the difference between the actual and predicted cases
train = data_Barcelona[:(len(x_train)+92)]
valid = data_Barcelona[(len(x_train)+91):]
valid.insert(1, "Predictions", predictions, True)
valid.insert(1, "Difference", valid["Predictions"] - valid["num_casos"], True)
# Zoom in to a closer timeframe
# Date from which the data is displayed
display_start_date = "2021-07-15" 
valid = valid[valid.index > display_start_date]
train = train[train.index > display_start_date]
# Visualize the data
matplotlib.style.use('ggplot')
fig, ax1 = plt.subplots(figsize=(22, 10), sharex=True)

# Data - Train
xt = train.index; 
yt = train[["num_casos"]]
# Data - Test / validation 
xv = valid.index; 
yv = valid[["num_casos", "Predictions"]]

# Plot
plt.title("Barcelona: Predictions vs Real infections", fontsize=20)
plt.ylabel("Nº Cases", fontsize=18)

plt.plot(yt, color="blue", linewidth=1.5)
plt.plot(yv["Predictions"], color="red", linewidth=1.5)
plt.plot(yv["num_casos"], color="green", linewidth=1.5)
plt.legend(["Train", "LSTM Predictions", "Test"], 
           loc="upper left", fontsize=18)

# Bar plot with the differences
x = valid.index
y = valid["Difference"]
plt.bar(x, y, width=0.2, color="grey")
plt.grid()
plt.show()
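The Madrid, Málaga and Sevilla sections below repeat this pipeline step by step. As a sketch of how it could be factored (a hypothetical helper, assuming exactly the preprocessing, split and model used above), the whole per-province workflow fits in one function:

# Hypothetical refactor of the per-province pipeline (not from the original
# notebook); returns the test-set errors in original case units.
def evaluate_province(data_covid, provincia, start='2020-06-14',
                      end='2021-12-31', historic_values=90, test_days=90):
    serie = (data_covid.loc[data_covid['provincia'] == provincia]
             .set_index('fecha')
             .filter(['num_casos'])[start:end])
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled = scaler.fit_transform(serie.values)
    x = np.array([scaled[i - historic_values:i, 0]
                  for i in range(historic_values, len(scaled))])
    y = np.array([scaled[i, 0]
                  for i in range(historic_values, len(scaled))])
    x_train = x[:-(test_days + 1)].reshape(-1, historic_values, 1)
    y_train = y[:-(test_days + 1)]
    x_test = x[-test_days:].reshape(-1, historic_values, 1)
    y_test = y[-test_days:]
    model = Sequential()
    model.add(LSTM(90, activation='relu', return_sequences=True,
                   input_shape=(historic_values, 1)))
    model.add(LSTM(50, activation='relu', return_sequences=True))
    model.add(LSTM(25, activation='relu'))
    model.add(Dense(5, activation='relu'))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mean_squared_error')
    model.fit(x_train, y_train, batch_size=1000, epochs=50,
              validation_data=(x_test, y_test), verbose=0)
    preds = scaler.inverse_transform(model.predict(x_test))
    truth = scaler.inverse_transform(y_test.reshape(-1, 1))
    return {'MAE': mean_absolute_error(truth, preds),
            'RMSE': np.sqrt(mean_squared_error(truth, preds))}

For example, evaluate_province(data_covid, 'Madrid') would reproduce the Madrid run below, up to random initialisation.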

Madrid

data_Madrid = data_covid.loc[data_covid['provincia'] == 'Madrid']
data_Madrid = data_Madrid.set_index('fecha')
data_Madrid = data_Madrid.filter(['num_casos'])
data_Madrid = data_Madrid['2020-06-14':'2021-12-31']
data_Madrid
num_casos
fecha
2020-06-14 81
2020-06-15 153
2020-06-16 91
2020-06-17 93
2020-06-18 85
... ...
2021-12-27 22958
2021-12-28 23811
2021-12-29 21914
2021-12-30 20666
2021-12-31 7556

566 rows × 1 columns

data_Madrid.describe()
num_casos
count 566.000000
mean 1994.136042
std 2795.419848
min 28.000000
25% 550.250000
50% 1312.500000
75% 2282.500000
max 23811.000000
np_data_Madrid = data_Madrid.values
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data_Madrid = scaler.fit_transform(np_data_Madrid)
print(f'Longitud del conjunto de datos disponible: {len(scaled_data_Madrid)}')
Longitud del conjunto de datos disponible: 566
# Since we are going to predict future values from the previous X elements,
# we need to build the list of historical values for each element
historic_values = 90
scaled_data_Madrid_x = []
scaled_data_Madrid_y = []

for num_casos_i in range(historic_values, len(scaled_data_Madrid)):
    scaled_data_Madrid_x.append(scaled_data_Madrid[(num_casos_i-historic_values):num_casos_i, 0])
    scaled_data_Madrid_y.append(scaled_data_Madrid[num_casos_i, 0])

# Convert the x and y lists to numpy arrays
scaled_data_Madrid_x = np.array(scaled_data_Madrid_x)
scaled_data_Madrid_y = np.array(scaled_data_Madrid_y)
# A training window (x) looks like
scaled_data_Madrid_x[235]
array([0.11428331, 0.14741622, 0.08796199, 0.06395324, 0.11087752,
       0.11272758, 0.08863474, 0.06862044, 0.09216667, 0.06046336,
       0.04620948, 0.08363117, 0.07660934, 0.06508851, 0.05621663,
       0.06428962, 0.04747088, 0.04288778, 0.06286003, 0.05844511,
       0.0505403 , 0.0445276 , 0.05243241, 0.04061725, 0.03943994,
       0.05398814, 0.05138124, 0.04734474, 0.04171047, 0.05159147,
       0.03830467, 0.03771602, 0.05474499, 0.05365177, 0.04726065,
       0.04360257, 0.05201194, 0.04036497, 0.03906151, 0.05360972,
       0.05096077, 0.05226422, 0.05625867, 0.04229912, 0.04974141,
       0.04700837, 0.07206828, 0.07745028, 0.06176681, 0.05815078,
       0.07400244, 0.05239036, 0.05247446, 0.07026027, 0.07009208,
       0.07698776, 0.05381996, 0.06079973, 0.07063869, 0.07370811,
       0.09796914, 0.09767481, 0.08493462, 0.08094017, 0.08867679,
       0.07139553, 0.05890762, 0.09666569, 0.09679183, 0.08846655,
       0.08459824, 0.09405878, 0.07101711, 0.0631964 , 0.092461  ,
       0.09771686, 0.08350502, 0.07568431, 0.08821427, 0.06386915,
       0.05398814, 0.08152882, 0.0865324 , 0.07240466, 0.06172476,
       0.07400244, 0.04566287, 0.04406509, 0.04633562, 0.06613968])
# The corresponding target (y) looks like
scaled_data_Madrid_y[235]
0.0639532439137199
# Since the first 90 values have no history behind them, the dataset shrinks by 90 elements
print(f'Longitud datos de entrenamiento con historico: {len(scaled_data_Madrid_y)}')
Longitud datos de entrenamiento con historico: 476
# we split the data into train and test
# as in previous analyses, we are going to predict a maximum of 90 days
x_train = scaled_data_Madrid_x[0:len(scaled_data_Madrid_x)-91]
y_train = scaled_data_Madrid_y[0:len(scaled_data_Madrid_y)-91]
print(f'Cantidad datos de entrenamiento: x={len(x_train)} - y={len(y_train)}')

x_test = scaled_data_Madrid_x[len(scaled_data_Madrid_x)-90:len(scaled_data_Madrid_x)]
y_test = scaled_data_Madrid_y[len(scaled_data_Madrid_y)-90:len(scaled_data_Madrid_y)]
print(f'Cantidad datos de test: x={len(x_test)} - y={len(y_test)}')
Cantidad datos de entrenamiento: x=385 - y=385
Cantidad datos de test: x=90 - y=90
# Reshape the data to feed the recurrent network
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
print("Train data shape:")
print(x_train.shape)
print(y_train.shape)
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))
print("Test data shape:")
print(x_test.shape)
print(y_test.shape)
Train data shape:
(385, 90, 1)
(385,)
Test data shape:
(90, 90, 1)
(90,)
# Configure / setup the neural network model - LSTM
# Build the model
print('Build model...')
model = Sequential()
# Stacked LSTM model
# input_shape = (timesteps, features) -> (90, 1)
neurons = x_train.shape[1]  # 90 timesteps per window
model.add(LSTM(neurons, 
               activation = 'relu',
               return_sequences = True, 
               input_shape = (x_train.shape[1], 1))) 
model.add(LSTM(50, 
               activation = 'relu',
               return_sequences = True)) 
model.add(LSTM(25, 
               activation = 'relu',
               return_sequences = False)) 
model.add(Dense(5, activation = 'relu'))
model.add(Dense(1))
Build model...
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()
Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 lstm_21 (LSTM)              (None, 90, 90)            33120     
                                                                 
 lstm_22 (LSTM)              (None, 90, 50)            28200     
                                                                 
 lstm_23 (LSTM)              (None, 25)                7600      
                                                                 
 dense_14 (Dense)            (None, 5)                 130       
                                                                 
 dense_15 (Dense)            (None, 1)                 6         
                                                                 
=================================================================
Total params: 69,056
Trainable params: 69,056
Non-trainable params: 0
_________________________________________________________________
# Train the model (fit the network)
history = model.fit(x_train, 
                    y_train, 
                    batch_size = 1000, 
                    epochs = 50, 
                    validation_data = (x_test, y_test), 
                    verbose = 0)
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show() 

# Get the predicted values
predictions = model.predict(x_test)
predictions = scaler.inverse_transform(predictions)
3/3 [==============================] - 0s 31ms/step
y_test = y_test.reshape(-1,1)
y_test = scaler.inverse_transform(y_test)
# Calculate the mean absolute error (MAE)
mae = mean_absolute_error(y_test, predictions)
print('MAE: ' + str(round(mae, 1)))

# Calculate the root mean squared error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test,predictions))
print('RMSE: ' + str(round(rmse, 1)))

# Alternative: RMSE straight from mean_squared_error with squared=False (same value)
rmse = mean_squared_error(y_test, 
                          predictions,
                          squared = False)
print('RMSE: ' + str(round(rmse, 1)))
MAE: 2521.6
RMSE: 5345.2
RMSE: 5345.2
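These errors are scale-dependent, which makes raw MAE/RMSE hard to compare across provinces. One rough normalisation (an illustrative choice, not part of the original analysis) is to divide by the mean daily case count; for Madrid the test-set MAE exceeds the whole-period mean of roughly 1994 cases/day, suggesting the model struggles badly on the December surge:

# Scale the errors by Madrid's mean daily cases (illustrative only)
mean_casos = data_Madrid['num_casos'].mean()
print(f'MAE / mean daily cases: {mae / mean_casos:.2f}')
print(f'RMSE / mean daily cases: {rmse / mean_casos:.2f}')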
# Add the difference between the actual and predicted cases
train = data_Madrid[:(len(x_train)+92)]
valid = data_Madrid[(len(x_train)+91):]
valid.insert(1, "Predictions", predictions, True)
valid.insert(1, "Difference", valid["Predictions"] - valid["num_casos"], True)
# Zoom in to a closer timeframe
# Date from which the data is displayed
display_start_date = "2021-07-15" 
valid = valid[valid.index > display_start_date]
train = train[train.index > display_start_date]
# Visualize the data
matplotlib.style.use('ggplot')
fig, ax1 = plt.subplots(figsize=(22, 10), sharex=True)

# Data - Train
xt = train.index; 
yt = train[["num_casos"]]
# Data - Test / validation 
xv = valid.index; 
yv = valid[["num_casos", "Predictions"]]

# Plot
plt.title("Madrid: Predictions vs Real infections", fontsize=20)
plt.ylabel("Nº Cases", fontsize=18)

plt.plot(yt, color="blue", linewidth=1.5)
plt.plot(yv["Predictions"], color="red", linewidth=1.5)
plt.plot(yv["num_casos"], color="green", linewidth=1.5)
plt.legend(["Train", "LSTM Predictions", "Test"], 
           loc="upper left", fontsize=18)

# Bar plot with the differences
x = valid.index
y = valid["Difference"]
plt.bar(x, y, width=0.2, color="grey")
plt.grid()
plt.show()

Málaga

data_Malaga = data_covid.loc[data_covid['provincia'] == 'Málaga']
data_Malaga = data_Malaga.set_index('fecha')
data_Malaga = data_Malaga.filter(['num_casos'])
data_Malaga = data_Malaga['2020-06-14':'2021-12-31']
data_Malaga
num_casos
fecha
2020-06-14 2
2020-06-15 1
2020-06-16 1
2020-06-17 2
2020-06-18 2
... ...
2021-12-27 1627
2021-12-28 2772
2021-12-29 3080
2021-12-30 3075
2021-12-31 2646

566 rows × 1 columns

data_Malaga.describe()
num_casos
count 566.000000
mean 339.227915
std 427.261346
min 1.000000
25% 110.000000
50% 193.500000
75% 344.750000
max 3080.000000
np_data_Malaga = data_Malaga.values
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data_Malaga = scaler.fit_transform(np_data_Malaga)
print(f'Longitud del conjunto de datos disponible: {len(scaled_data_Malaga)}')
Longitud del conjunto de datos disponible: 566
# Since we are going to predict future values from the previous X elements,
# we need to build the list of historical values for each element
historic_values = 90
scaled_data_Malaga_x = []
scaled_data_Malaga_y = []

for num_casos_i in range(historic_values, len(scaled_data_Malaga)):
    scaled_data_Malaga_x.append(scaled_data_Malaga[(num_casos_i-historic_values):num_casos_i, 0])
    scaled_data_Malaga_y.append(scaled_data_Malaga[num_casos_i, 0])

# Convert the x and y lists to numpy arrays
scaled_data_Malaga_x = np.array(scaled_data_Malaga_x)
scaled_data_Malaga_y = np.array(scaled_data_Malaga_y)
# A training window (x) looks like
scaled_data_Malaga_x[235]
array([0.24423514, 0.21890224, 0.15621955, 0.12016889, 0.15329652,
       0.14420266, 0.14712569, 0.14647613, 0.11627152, 0.09418642,
       0.06722962, 0.09516077, 0.11042546, 0.06852874, 0.06950309,
       0.06658006, 0.04871712, 0.03442676, 0.05943488, 0.0490419 ,
       0.04936668, 0.04449497, 0.04157194, 0.03962325, 0.02923027,
       0.04157194, 0.04741799, 0.05131536, 0.04871712, 0.04287106,
       0.02760637, 0.02858071, 0.0422215 , 0.04092238, 0.03994804,
       0.03507632, 0.0506658 , 0.03020461, 0.02630724, 0.04092238,
       0.03117895, 0.04319584, 0.03702501, 0.0354011 , 0.02923027,
       0.02078597, 0.04157194, 0.04514453, 0.03832413, 0.04741799,
       0.0490419 , 0.0354011 , 0.02858071, 0.05326405, 0.05293927,
       0.05586229, 0.0422215 , 0.04027282, 0.03442676, 0.0490419 ,
       0.0659305 , 0.07794739, 0.0659305 , 0.05878532, 0.06203313,
       0.05034102, 0.03085417, 0.07047743, 0.05618707, 0.0776226 ,
       0.06138357, 0.07080221, 0.04124716, 0.04546931, 0.0659305 ,
       0.06690484, 0.05878532, 0.05488795, 0.05293927, 0.04352062,
       0.03637545, 0.0591101 , 0.06495615, 0.05975966, 0.05521273,
       0.05846054, 0.04481975, 0.04514453, 0.06235791, 0.07697304])
# The corresponding target (y) looks like
scaled_data_Malaga_y[235]
0.07340045469308218
# Since the first 90 values have no history behind them, the dataset shrinks by 90 elements
print(f'Longitud datos de entrenamiento con historico: {len(scaled_data_Malaga_y)}')
Longitud datos de entrenamiento con historico: 476
# we split the data into train and test
# as in previous analyses, we are going to predict a maximum of 90 days
x_train = scaled_data_Malaga_x[0:len(scaled_data_Malaga_x)-91]
y_train = scaled_data_Malaga_y[0:len(scaled_data_Malaga_y)-91]
print(f'Cantidad datos de entrenamiento: x={len(x_train)} - y={len(y_train)}')

x_test = scaled_data_Malaga_x[len(scaled_data_Malaga_x)-90:len(scaled_data_Malaga_x)]
y_test = scaled_data_Malaga_y[len(scaled_data_Malaga_y)-90:len(scaled_data_Malaga_y)]
print(f'Cantidad datos de test: x={len(x_test)} - y={len(y_test)}')
Cantidad datos de entrenamiento: x=385 - y=385
Cantidad datos de test: x=90 - y=90
# Reshape the data to feed the recurrent network
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
print("Train data shape:")
print(x_train.shape)
print(y_train.shape)
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))
print("Test data shape:")
print(x_test.shape)
print(y_test.shape)
Train data shape:
(385, 90, 1)
(385,)
Test data shape:
(90, 90, 1)
(90,)
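The reshape only appends a singleton feature axis, since Keras LSTMs expect 3-D input of shape (samples, timesteps, features). An equivalent formulation, as an illustrative check:

# Equivalent way to obtain the 3-D test tensor
alt = scaled_data_Malaga_x[-90:][..., np.newaxis]
print(alt.shape)                     # (90, 90, 1)
print(np.array_equal(alt, x_test))   # True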
# Configure / setup the neural network model - LSTM
# Build the model
print('Build model...')
model = Sequential()
# Stacked LSTM model
# input_shape = (timesteps, features) -> (90, 1)
neurons = x_train.shape[1]  # 90 timesteps per window
model.add(LSTM(neurons, 
               activation = 'relu',
               return_sequences = True, 
               input_shape = (x_train.shape[1], 1))) 
model.add(LSTM(50, 
               activation = 'relu',
               return_sequences = True)) 
model.add(LSTM(25, 
               activation = 'relu',
               return_sequences = False)) 
model.add(Dense(5, activation = 'relu'))
model.add(Dense(1))
Build model...
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()
Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 lstm_24 (LSTM)              (None, 90, 90)            33120     
                                                                 
 lstm_25 (LSTM)              (None, 90, 50)            28200     
                                                                 
 lstm_26 (LSTM)              (None, 25)                7600      
                                                                 
 dense_16 (Dense)            (None, 5)                 130       
                                                                 
 dense_17 (Dense)            (None, 1)                 6         
                                                                 
=================================================================
Total params: 69,056
Trainable params: 69,056
Non-trainable params: 0
_________________________________________________________________
# Train the model (fit the network)
history = model.fit(x_train, 
                    y_train, 
                    batch_size = 1000, 
                    epochs = 50, 
                    validation_data = (x_test, y_test), 
                    verbose = 0)
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show() 

# Get the predicted values
predictions = model.predict(x_test)
predictions = scaler.inverse_transform(predictions)
3/3 [==============================] - 0s 31ms/step
y_test = y_test.reshape(-1,1)
y_test = scaler.inverse_transform(y_test)
# Calculate the mean absolute error (MAE)
mae = mean_absolute_error(y_test, predictions)
print('MAE: ' + str(round(mae, 1)))

# Calculate the root mean squared error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test,predictions))
print('RMSE: ' + str(round(rmse, 1)))

# Alternative: RMSE straight from mean_squared_error with squared=False (same value)
rmse = mean_squared_error(y_test, 
                          predictions,
                          squared = False)
print('RMSE: ' + str(round(rmse, 1)))
MAE: 197.7
RMSE: 331.7
RMSE: 331.7
# Add the difference between the actual and predicted cases
train = data_Malaga[:(len(x_train)+92)]
valid = data_Malaga[(len(x_train)+91):]
valid.insert(1, "Predictions", predictions, True)
valid.insert(1, "Difference", valid["Predictions"] - valid["num_casos"], True)
# Zoom in to a closer timeframe
# Date from which the data is displayed
display_start_date = "2021-07-15" 
valid = valid[valid.index > display_start_date]
train = train[train.index > display_start_date]
# Visualize the data
matplotlib.style.use('ggplot')
fig, ax1 = plt.subplots(figsize=(22, 10), sharex=True)

# Data - Train
xt = train.index; 
yt = train[["num_casos"]]
# Data - Test / validation 
xv = valid.index; 
yv = valid[["num_casos", "Predictions"]]

# Plot
plt.title("Malaga: Predictions vs Real infections", fontsize=20)
plt.ylabel("Nº Cases", fontsize=18)

plt.plot(yt, color="blue", linewidth=1.5)
plt.plot(yv["Predictions"], color="red", linewidth=1.5)
plt.plot(yv["num_casos"], color="green", linewidth=1.5)
plt.legend(["Train", "LSTM Predictions", "Test"], 
           loc="upper left", fontsize=18)

# Bar plot with the differences
x = valid.index
y = valid["Difference"]
plt.bar(x, y, width=0.2, color="grey")
plt.grid()
plt.show()

Sevilla

data_Sevilla = data_covid.loc[data_covid['provincia'] == 'Sevilla']
data_Sevilla = data_Sevilla.set_index('fecha')
data_Sevilla = data_Sevilla.filter(['num_casos'])
data_Sevilla = data_Sevilla['2020-06-14':'2021-12-31']
data_Sevilla
num_casos
fecha
2020-06-14 0
2020-06-15 2
2020-06-16 1
2020-06-17 0
2020-06-18 0
... ...
2021-12-27 2617
2021-12-28 3190
2021-12-29 3692
2021-12-30 3508
2021-12-31 2816

566 rows × 1 columns

data_Sevilla.describe()
num_casos
count 566.000000
mean 383.807420
std 454.714408
min 0.000000
25% 112.750000
50% 260.500000
75% 459.000000
max 3692.000000
np_data_Sevilla = data_Sevilla.values
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data_Sevilla = scaler.fit_transform(np_data_Sevilla)
print(f'Longitud del conjunto de datos disponible: {len(scaled_data_Sevilla)}')
Longitud del conjunto de datos disponible: 566
# Since we are going to predict future values from the previous X elements,
# we need to build the list of historical values for each element
historic_values = 90
scaled_data_Sevilla_x = []
scaled_data_Sevilla_y = []

for num_casos_i in range(historic_values, len(scaled_data_Sevilla)):
    scaled_data_Sevilla_x.append(scaled_data_Sevilla[(num_casos_i-historic_values):num_casos_i, 0])
    scaled_data_Sevilla_y.append(scaled_data_Sevilla[num_casos_i, 0])

# Convert the x and y lists to numpy arrays
scaled_data_Sevilla_x = np.array(scaled_data_Sevilla_x)
scaled_data_Sevilla_y = np.array(scaled_data_Sevilla_y)
# A training window (x) looks like
scaled_data_Sevilla_x[235]
array([0.18634886, 0.19799567, 0.14626219, 0.0896533 , 0.13894908,
       0.1695558 , 0.14572048, 0.12242687, 0.13434453, 0.06500542,
       0.05390033, 0.08206934, 0.08667389, 0.09452871, 0.06798483,
       0.05904659, 0.04252438, 0.0343987 , 0.06554713, 0.06473456,
       0.05444204, 0.0503792 , 0.05796316, 0.04577465, 0.03819068,
       0.04712893, 0.0528169 , 0.0663597 , 0.06013001, 0.05119177,
       0.03764897, 0.031961  , 0.05119177, 0.05010834, 0.04739978,
       0.05119177, 0.05390033, 0.03927411, 0.03737811, 0.06013001,
       0.05633803, 0.05606717, 0.05850488, 0.06473456, 0.04658722,
       0.04198267, 0.07150596, 0.07340195, 0.05823402, 0.08071506,
       0.07773564, 0.05877573, 0.05471289, 0.07990249, 0.08829902,
       0.0872156 , 0.09019502, 0.07069339, 0.08775731, 0.08396533,
       0.14951246, 0.12432286, 0.12269772, 0.12378115, 0.13028169,
       0.11348862, 0.07936078, 0.13299025, 0.12269772, 0.11213434,
       0.12107259, 0.12432286, 0.09723727, 0.07042254, 0.09886241,
       0.11782232, 0.10157096, 0.10346696, 0.12621885, 0.08342362,
       0.07340195, 0.0847779 , 0.08559047, 0.11159263, 0.10292524,
       0.08911159, 0.07502709, 0.04821235, 0.08315276, 0.09913326])
# The corresponding target (y) looks like
scaled_data_Sevilla_y[235]
0.10861321776814734
# Since the first 90 values have no history behind them, the dataset shrinks by 90 elements
print(f'Longitud datos de entrenamiento con historico: {len(scaled_data_Sevilla_y)}')
Longitud datos de entrenamiento con historico: 476
# we split the data into train and test
# as in previous analyses, we are going to predict a maximum of 90 days
x_train = scaled_data_Sevilla_x[0:len(scaled_data_Sevilla_x)-91]
y_train = scaled_data_Sevilla_y[0:len(scaled_data_Sevilla_y)-91]
print(f'Cantidad datos de entrenamiento: x={len(x_train)} - y={len(y_train)}')

x_test = scaled_data_Sevilla_x[len(scaled_data_Sevilla_x)-90:len(scaled_data_Sevilla_x)]
y_test = scaled_data_Sevilla_y[len(scaled_data_Sevilla_y)-90:len(scaled_data_Sevilla_y)]
print(f'Cantidad datos de test: x={len(x_test)} - y={len(y_test)}')
Cantidad datos de entrenamiento: x=385 - y=385
Cantidad datos de test: x=90 - y=90
# Reshape the data to feed the recurrent network
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
print("Train data shape:")
print(x_train.shape)
print(y_train.shape)
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))
print("Test data shape:")
print(x_test.shape)
print(y_test.shape)
Train data shape:
(385, 90, 1)
(385,)
Test data shape:
(90, 90, 1)
(90,)
# Configure / setup the neural network model - LSTM
# Build the model
print('Build model...')
model = Sequential()
# Stacked LSTM model
# input_shape = (timesteps, features) -> (90, 1)
neurons = x_train.shape[1]  # 90 timesteps per window
model.add(LSTM(neurons, 
               activation = 'relu',
               return_sequences = True, 
               input_shape = (x_train.shape[1], 1))) 
model.add(LSTM(50, 
               activation = 'relu',
               return_sequences = True)) 
model.add(LSTM(25, 
               activation = 'relu',
               return_sequences = False)) 
model.add(Dense(5, activation = 'relu'))
model.add(Dense(1))
Build model...
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()
Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 lstm_27 (LSTM)              (None, 90, 90)            33120     
                                                                 
 lstm_28 (LSTM)              (None, 90, 50)            28200     
                                                                 
 lstm_29 (LSTM)              (None, 25)                7600      
                                                                 
 dense_18 (Dense)            (None, 5)                 130       
                                                                 
 dense_19 (Dense)            (None, 1)                 6         
                                                                 
=================================================================
Total params: 69,056
Trainable params: 69,056
Non-trainable params: 0
_________________________________________________________________
# Train the model (fit the network)
history = model.fit(x_train, 
                    y_train, 
                    batch_size = 1000, 
                    epochs = 50, 
                    validation_data = (x_test, y_test), 
                    verbose = 0)
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show() 

# Get the predicted values
predictions = model.predict(x_test)
predictions = scaler.inverse_transform(predictions)
3/3 [==============================] - 0s 32ms/step
y_test = y_test.reshape(-1,1)
y_test = scaler.inverse_transform(y_test)
# Calculate the mean absolute error (MAE)
mae = mean_absolute_error(y_test, predictions)
print('MAE: ' + str(round(mae, 1)))

# Calculate the root mean squared error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test,predictions))
print('RMSE: ' + str(round(rmse, 1)))

# Alternative: RMSE straight from mean_squared_error with squared=False (same value)
rmse = mean_squared_error(y_test, 
                          predictions,
                          squared = False)
print('RMSE: ' + str(round(rmse, 1)))
MAE: 329.2
RMSE: 595.9
RMSE: 595.9
# Add the difference between the actual and predicted cases
train = data_Sevilla[:(len(x_train)+92)]
valid = data_Sevilla[(len(x_train)+91):]
valid.insert(1, "Predictions", predictions, True)
valid.insert(1, "Difference", valid["Predictions"] - valid["num_casos"], True)
# Zoom in to a closer timeframe
# Date from which the data is displayed
display_start_date = "2021-07-15" 
valid = valid[valid.index > display_start_date]
train = train[train.index > display_start_date]
# Visualize the data
matplotlib.style.use('ggplot')
fig, ax1 = plt.subplots(figsize=(22, 10), sharex=True)

# Data - Train
xt = train.index; 
yt = train[["num_casos"]]
# Data - Test / validation 
xv = valid.index; 
yv = valid[["num_casos", "Predictions"]]

# Plot
plt.title("Sevilla: Predictions vs Real infections", fontsize=20)
plt.ylabel("Nº Cases", fontsize=18)

plt.plot(yt, color="blue", linewidth=1.5)
plt.plot(yv["Predictions"], color="red", linewidth=1.5)
plt.plot(yv["num_casos"], color="green", linewidth=1.5)
plt.legend(["Train", "LSTM Predictions", "Test"], 
           loc="upper left", fontsize=18)

# Bar plot with the differences
x = valid.index
y = valid["Difference"]
plt.bar(x, y, width=0.2, color="grey")
plt.grid()
plt.show()
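To close, the test-set errors reported above for the four provinces, gathered into a single frame (values transcribed from the outputs above, not recomputed):

# Summary of the reported test-set errors per province
results = pd.DataFrame({
    'MAE': [1674.3, 2521.6, 197.7, 329.2],
    'RMSE': [3525.8, 5345.2, 331.7, 595.9]},
    index=['Barcelona', 'Madrid', 'Málaga', 'Sevilla'])
print(results)

The absolute errors track each province's case volume: Madrid, with by far the largest daily counts, also shows the largest MAE and RMSE.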