LSTM univariate prediction
Import python packages:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from keras import Input
from keras.models import Sequential
from keras.layers import Dense, LSTM, Activation, Dropout
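Keras weight initialization and batch shuffling are stochastic, so the error metrics reported below will vary slightly from run to run. A minimal sketch to pin the seeds, assuming the TensorFlow backend (the value 42 is arbitrary):
import random
import tensorflow as tf
random.seed(42)         # Python-level randomness
np.random.seed(42)      # NumPy randomness
tf.random.set_seed(42)  # Keras weight initialization and shuffling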
data_covid = pd.read_csv('data/clean/final_covid_data.csv')
data_covid
 | provincia | fecha | num_casos | num_casos_prueba_pcr | num_casos_prueba_test_ac | num_casos_prueba_ag | num_casos_prueba_elisa | num_casos_prueba_desconocida | num_hosp | num_uci | ... | ws | ws_max | sol | mob_grocery_pharmacy | mob_parks | mob_residential | mob_retail_recreation | mob_transit_stations | mob_workplaces | mob_flujo |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Barcelona | 2020-01-01 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 2.5 | 7.2 | 4.9 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | Madrid | 2020-01-01 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0.8 | 3.6 | 8.3 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | Málaga | 2020-01-01 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 3.3 | 6.7 | 7.7 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | Asturias | 2020-01-01 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 2.5 | 7.8 | 7.9 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | Sevilla | 2020-01-01 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 1.9 | 5.8 | 9.1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
4090 | Barcelona | 2022-03-29 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 7.2 | 13.3 | 0.0 | 0.0 | -13.0 | 5.0 | -24.0 | -16.0 | -17.0 | NaN |
4091 | Madrid | 2022-03-29 | 6 | 3 | 0 | 3 | 0 | 0 | 0 | 0 | ... | 2.2 | 6.1 | 2.4 | 1.0 | -11.0 | 4.0 | -25.0 | -16.0 | -16.0 | NaN |
4092 | Málaga | 2022-03-29 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 4.2 | 10.8 | 1.5 | 4.0 | -3.0 | 4.0 | -16.0 | 1.0 | -8.0 | NaN |
4093 | Asturias | 2022-03-29 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 2.2 | 6.7 | 0.0 | -4.0 | 17.0 | 3.0 | -25.0 | -15.0 | -12.0 | NaN |
4094 | Sevilla | 2022-03-29 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 2.5 | 8.3 | 1.6 | 3.0 | -7.0 | 2.0 | -15.0 | -13.0 | -5.0 | NaN |
4095 rows × 26 columns
All the available data
We will only delete the data from the first wave, since it is not reliable.
Asturias
data_asturias = data_covid.loc[data_covid['provincia'] == 'Asturias']
data_asturias = data_asturias.set_index('fecha')
data_asturias = data_asturias.filter(['num_casos'])
data_asturias = data_asturias['2020-06-14':]
data_asturias
fecha | num_casos
---|---
2020-06-14 | 0 |
2020-06-15 | 0 |
2020-06-16 | 1 |
2020-06-17 | 0 |
2020-06-18 | 0 |
... | ... |
2022-03-25 | 244 |
2022-03-26 | 432 |
2022-03-27 | 1 |
2022-03-28 | 9 |
2022-03-29 | 0 |
654 rows × 1 columns
data_asturias.describe()
 | num_casos
---|---
count | 654.00000 |
mean | 312.11315 |
std | 565.17985 |
min | 0.00000 |
25% | 43.50000 |
50% | 117.00000 |
75% | 323.75000 |
max | 3827.00000 |
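The distribution is heavily right-skewed: the median (117) is far below the maximum (3827). After min-max scaling most observations therefore land near zero, which is why the sample windows printed below hover around 0.03. A quick worked check (the minimum is 0, so the scaled median is exactly the ratio):
# Rough check: the median case count maps close to 0.03 after scaling
print(117 / 3827)  # ~0.0306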
np_data_asturias = data_asturias.values
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data_asturias = scaler.fit_transform(np_data_asturias)
print(f'Length of the available dataset: {len(scaled_data_asturias)}')
Length of the available dataset: 654
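One caveat worth flagging: the scaler is fitted on the full series, so any extremes from the test period leak into the training transform. A stricter variant, sketched here with the same variable names, fits the scaler on the training span only:
# Sketch: fit on the training span only to avoid look-ahead leakage;
# the windowing and split arithmetic below would stay unchanged
train_cutoff = len(np_data_asturias) - 91
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(np_data_asturias[:train_cutoff])
scaled_data_asturias = scaler.transform(np_data_asturias)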
# Since we are going to predict future values based on the 90 past elements,
# we need to create a list with that historic information for each element
historic_values = 90
scaled_data_asturias_x = []
scaled_data_asturias_y = []

for num_casos_i in range(historic_values, len(scaled_data_asturias)):
    scaled_data_asturias_x.append(scaled_data_asturias[(num_casos_i-historic_values):num_casos_i, 0])
    scaled_data_asturias_y.append(scaled_data_asturias[num_casos_i, 0])
# Convert the x and y lists to numpy arrays
scaled_data_asturias_x = np.array(scaled_data_asturias_x)
scaled_data_asturias_y = np.array(scaled_data_asturias_y)
# Train data looks like
scaled_data_asturias_x[235]
array([0.07002874, 0.0611445 , 0.07107395, 0.07394826, 0.06689313,
0.05644108, 0.05278286, 0.03841129, 0.03501437, 0.04285341,
0.04990854, 0.05016985, 0.04285341, 0.03710478, 0.03057225,
0.03240136, 0.03945649, 0.03971779, 0.03762738, 0.04311471,
0.02926574, 0.02691403, 0.03945649, 0.03919519, 0.03396917,
0.04206951, 0.03684348, 0.03841129, 0.03475307, 0.02795924,
0.02560753, 0.02743663, 0.03788869, 0.03814999, 0.02769794,
0.03031095, 0.02822054, 0.03893389, 0.03240136, 0.03553697,
0.03971779, 0.01802979, 0.01646198, 0.03004965, 0.03109485,
0.02926574, 0.04468252, 0.03266266, 0.02482362, 0.03031095,
0.03109485, 0.03893389, 0.03579828, 0.02325581, 0.03841129,
0.0195976 , 0.02508492, 0.02456232, 0.03057225, 0.03527567,
0.03919519, 0.03605958, 0.02534622, 0.02926574, 0.03240136,
0.04807944, 0.0399791 , 0.03396917, 0.02743663, 0.02325581,
0.02613013, 0.03083355, 0.03788869, 0.02848184, 0.03344656,
0.03396917, 0.02822054, 0.02247191, 0.02351712, 0.02586883,
0.02534622, 0.02874314, 0.0198589 , 0.0211654 , 0.01254246,
0.01907499, 0.01228116, 0.01228116, 0.01881369, 0.01515547])
# The corresponding target value looks like
scaled_data_asturias_y[235]
0.013326365299189964
# Since the first 90 values have no history, the dataset has been reduced by 90 values
print(f'Training data length with history: {len(scaled_data_asturias_y)}')
Training data length with history: 564
# We split the data into train and test sets;
# as in previous analyses, we hold out the last 90 days for testing
x_train = scaled_data_asturias_x[0:len(scaled_data_asturias_x)-91]
y_train = scaled_data_asturias_y[0:len(scaled_data_asturias_y)-91]
print(f'Number of training samples: x={len(x_train)} - y={len(y_train)}')
x_test = scaled_data_asturias_x[len(scaled_data_asturias_x)-90:len(scaled_data_asturias_x)]
y_test = scaled_data_asturias_y[len(scaled_data_asturias_y)-90:len(scaled_data_asturias_y)]
print(f'Number of test samples: x={len(x_test)} - y={len(y_test)}')
Number of training samples: x=473 - y=473
Number of test samples: x=90 - y=90
# Reshape the data to feed the recurrent network
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
print("Train data shape:")
print(x_train.shape)
print(y_train.shape)
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))
print("Test data shape:")
print(x_test.shape)
print(y_test.shape)
Train data shape:
(473, 90, 1)
(473,)
Test data shape:
(90, 90, 1)
(90,)
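Keras LSTM layers expect 3-D input of shape (samples, timesteps, features); here each sample is one 90-day window of a single feature. A quick sanity check, purely illustrative:
assert x_train.shape == (len(y_train), historic_values, 1)  # (samples, timesteps, features)
assert x_test.shape == (len(y_test), historic_values, 1)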
# Configure / set up the neural network model - LSTM
# Build the model
print('Build model...')
model = Sequential()
# Stacked LSTM; input_shape = (timesteps, features)
neurons = x_train.shape[1]
model.add(LSTM(90,
               activation='relu',
               return_sequences=True,
               input_shape=(x_train.shape[1], 1)))
model.add(LSTM(50,
               activation='relu',
               return_sequences=True))
model.add(LSTM(25,
               activation='relu',
               return_sequences=False))
model.add(Dense(5, activation='relu'))
model.add(Dense(1))
Build model...
2022-05-25 01:22:54.266575: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
lstm (LSTM) (None, 90, 90) 33120
lstm_1 (LSTM) (None, 90, 50) 28200
lstm_2 (LSTM) (None, 25) 7600
dense (Dense) (None, 5) 130
dense_1 (Dense) (None, 1) 6
=================================================================
Total params: 69,056
Trainable params: 69,056
Non-trainable params: 0
_________________________________________________________________
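The parameter counts can be verified with the standard LSTM formula, 4 * ((input_dim + units) * units + units), which covers the four gates' kernels, recurrent kernels and biases:
def lstm_params(input_dim, units):
    # four gates, each with (input_dim + units) x units weights plus units biases
    return 4 * ((input_dim + units) * units + units)
print(lstm_params(1, 90))   # 33120, matches lstm
print(lstm_params(90, 50))  # 28200, matches lstm_1
print(lstm_params(50, 25))  # 7600, matches lstm_2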
# Training the model
# fit network
history = model.fit(x_train,
                    y_train,
                    batch_size=1000,
                    epochs=30,
                    validation_data=(x_test, y_test),
                    verbose=0)
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show()
# Get the predicted values
predictions = model.predict(x_test)
predictions = scaler.inverse_transform(predictions)
3/3 [==============================] - 0s 31ms/step
y_test = y_test.reshape(-1, 1)
y_test = scaler.inverse_transform(y_test)
# Calculate the mean absolute error (MAE)
mae = mean_absolute_error(y_test, predictions)
print('MAE: ' + str(round(mae, 1)))
# Calculate the root mean squared error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print('RMSE: ' + str(round(rmse, 1)))
# RMSE again, equivalently, via squared=False
rmse = mean_squared_error(y_test,
                          predictions,
                          squared=False)
print('RMSE: ' + str(round(rmse, 1)))
MAE: 1074.0
RMSE: 1479.1
RMSE: 1479.1
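Both RMSE computations agree by construction, since squared=False simply returns the square root of the MSE. To judge these errors against the scale of the series, one option (illustrative, not in the original analysis) is to normalize them by the mean of the test-period cases:
# Illustrative: errors relative to the average real case count in the test window
mean_cases = y_test.mean()
print(f'MAE/mean: {mae / mean_cases:.2f} - RMSE/mean: {rmse / mean_cases:.2f}')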
# Add the difference between the real and predicted cases
train = data_asturias[:(len(x_train)+92)]
valid = data_asturias[(len(x_train)+91):]
valid.insert(1, "Predictions", predictions, True)
valid.insert(1, "Difference", valid["Predictions"] - valid["num_casos"], True)
# Zoom in to a closer timeframe
# Date from which the data is displayed
display_start_date = "2021-10-31"
valid = valid[valid.index > display_start_date]
train = train[train.index > display_start_date]
# Visualize the data
matplotlib.style.use('ggplot')
fig, ax1 = plt.subplots(figsize=(22, 10), sharex=True)

# Data - Train
xt = train.index
yt = train[["num_casos"]]
# Data - Test / validation
xv = valid.index
yv = valid[["num_casos", "Predictions"]]

# Plot
plt.title("Asturias: Predictions vs Real infections", fontsize=20)
plt.ylabel("Nº Cases", fontsize=18)

plt.plot(yt, color="blue", linewidth=1.5)
plt.plot(yv["Predictions"], color="red", linewidth=1.5)
plt.plot(yv["num_casos"], color="green", linewidth=1.5)
plt.legend(["Train", "LSTM Predictions", "Test"],
           loc="upper left", fontsize=18)

# Bar plot with the differences
x = valid.index
y = valid["Difference"]
plt.bar(x, y, width=0.2, color="grey")

plt.grid()
plt.show()
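The pipeline above is repeated below for Barcelona, Madrid, Málaga and Sevilla, and again with a pre-sixth-wave cutoff, changing only the province name and the date range. A possible consolidation, sketched with hypothetical names and the make_windows helper from earlier, reproducing the same split arithmetic (including the one sample dropped between train and test):
def prepare_province(df, provincia, start, end=None, window=90, holdout=91):
    # Slice one province, scale it, window it and split train/test (sketch)
    serie = (df.loc[df['provincia'] == provincia]
               .set_index('fecha')
               .filter(['num_casos'])[start:end])
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled = scaler.fit_transform(serie.values)
    x, y = make_windows(scaled, window)
    x = x.reshape((x.shape[0], x.shape[1], 1))
    return (x[:-holdout], y[:-holdout],          # train
            x[-(holdout-1):], y[-(holdout-1):],  # test: last 90 windows
            scaler, serie)
# e.g.:
# x_train, y_train, x_test, y_test, scaler, serie = prepare_province(data_covid, 'Barcelona', '2020-06-14')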
Barcelona
data_Barcelona = data_covid.loc[data_covid['provincia'] == 'Barcelona']
data_Barcelona = data_Barcelona.set_index('fecha')
data_Barcelona = data_Barcelona.filter(['num_casos'])
data_Barcelona = data_Barcelona['2020-06-14':]
data_Barcelona
fecha | num_casos
---|---
2020-06-14 | 33 |
2020-06-15 | 62 |
2020-06-16 | 66 |
2020-06-17 | 70 |
2020-06-18 | 68 |
... | ... |
2022-03-25 | 598 |
2022-03-26 | 345 |
2022-03-27 | 252 |
2022-03-28 | 688 |
2022-03-29 | 0 |
654 rows × 1 columns
data_Barcelona.describe()
 | num_casos
---|---
count | 654.000000 |
mean | 2605.048930 |
std | 4823.001644 |
min | 0.000000 |
25% | 597.250000 |
50% | 1030.500000 |
75% | 2243.750000 |
max | 34701.000000 |
np_data_Barcelona = data_Barcelona.values
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data_Barcelona = scaler.fit_transform(np_data_Barcelona)
print(f'Length of the available dataset: {len(scaled_data_Barcelona)}')
Length of the available dataset: 654
# Since we are going to predict future values based on the 90 past elements,
# we need to create a list with that historic information for each element
historic_values = 90
scaled_data_Barcelona_x = []
scaled_data_Barcelona_y = []

for num_casos_i in range(historic_values, len(scaled_data_Barcelona)):
    scaled_data_Barcelona_x.append(scaled_data_Barcelona[(num_casos_i-historic_values):num_casos_i, 0])
    scaled_data_Barcelona_y.append(scaled_data_Barcelona[num_casos_i, 0])
# Convert the x and y lists to numpy arrays
scaled_data_Barcelona_x = np.array(scaled_data_Barcelona_x)
scaled_data_Barcelona_y = np.array(scaled_data_Barcelona_y)
# Train data looks like
scaled_data_Barcelona_x[235]
array([0.05037319, 0.04726089, 0.02899052, 0.02507132, 0.04639636,
0.04190081, 0.03749171, 0.03527276, 0.03786634, 0.02619521,
0.02351517, 0.04077692, 0.03760699, 0.03587793, 0.0364831 ,
0.03120948, 0.02449497, 0.02083513, 0.03792398, 0.03613729,
0.03195873, 0.02754964, 0.02818363, 0.02025878, 0.01752111,
0.03484049, 0.03037376, 0.02870234, 0.0233999 , 0.02610876,
0.01876027, 0.01645486, 0.0282989 , 0.02625285, 0.02680038,
0.02564768, 0.02596467, 0.01953834, 0.01746347, 0.03184346,
0.02973978, 0.02573413, 0.02838535, 0.02919224, 0.02190139,
0.01890435, 0.03720354, 0.03642546, 0.0325639 , 0.0304314 ,
0.03556093, 0.02524423, 0.02149794, 0.03803925, 0.03426414,
0.03048903, 0.03365897, 0.02218956, 0.02530186, 0.02533068,
0.02832771, 0.03835624, 0.03740526, 0.03512867, 0.03680009,
0.02855825, 0.02118095, 0.03878851, 0.03507104, 0.03985476,
0.03155529, 0.03394715, 0.02423561, 0.02169966, 0.03953777,
0.04596409, 0.0350134 , 0.0395954 , 0.03267917, 0.02772254,
0.02380335, 0.04668453, 0.04213135, 0.03806807, 0.03671364,
0.03509985, 0.02106568, 0.01694476, 0.03803925, 0.02815481])
# The corresponding target value looks like
scaled_data_Barcelona_y[235]
0.030661940578081325
# Since the first 90 values have no history, the dataset has been reduced by 90 values
print(f'Training data length with history: {len(scaled_data_Barcelona_y)}')
Training data length with history: 564
# We split the data into train and test sets;
# as in previous analyses, we hold out the last 90 days for testing
x_train = scaled_data_Barcelona_x[0:len(scaled_data_Barcelona_x)-91]
y_train = scaled_data_Barcelona_y[0:len(scaled_data_Barcelona_y)-91]
print(f'Number of training samples: x={len(x_train)} - y={len(y_train)}')
x_test = scaled_data_Barcelona_x[len(scaled_data_Barcelona_x)-90:len(scaled_data_Barcelona_x)]
y_test = scaled_data_Barcelona_y[len(scaled_data_Barcelona_y)-90:len(scaled_data_Barcelona_y)]
print(f'Number of test samples: x={len(x_test)} - y={len(y_test)}')
Number of training samples: x=473 - y=473
Number of test samples: x=90 - y=90
# Reshape the data to feed the recurrent network
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
print("Train data shape:")
print(x_train.shape)
print(y_train.shape)
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))
print("Test data shape:")
print(x_test.shape)
print(y_test.shape)
Train data shape:
(473, 90, 1)
(473,)
Test data shape:
(90, 90, 1)
(90,)
# Configure / set up the neural network model - LSTM
# Build the model
print('Build model...')
model = Sequential()
# Stacked LSTM; input_shape = (timesteps, features)
neurons = x_train.shape[1]
model.add(LSTM(90,
               activation='relu',
               return_sequences=True,
               input_shape=(x_train.shape[1], 1)))
model.add(LSTM(50,
               activation='relu',
               return_sequences=True))
model.add(LSTM(25,
               activation='relu',
               return_sequences=False))
model.add(Dense(5, activation='relu'))
model.add(Dense(1))
Build model...
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()
Model: "sequential_1"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
lstm_3 (LSTM) (None, 90, 90) 33120
lstm_4 (LSTM) (None, 90, 50) 28200
lstm_5 (LSTM) (None, 25) 7600
dense_2 (Dense) (None, 5) 130
dense_3 (Dense) (None, 1) 6
=================================================================
Total params: 69,056
Trainable params: 69,056
Non-trainable params: 0
_________________________________________________________________
# Training the model
# fit network
history = model.fit(x_train,
                    y_train,
                    batch_size=1000,
                    epochs=30,
                    validation_data=(x_test, y_test),
                    verbose=0)
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show()
# Get the predicted values
predictions = model.predict(x_test)
predictions = scaler.inverse_transform(predictions)
3/3 [==============================] - 0s 31ms/step
y_test = y_test.reshape(-1, 1)
y_test = scaler.inverse_transform(y_test)
# Calculate the mean absolute error (MAE)
mae = mean_absolute_error(y_test, predictions)
print('MAE: ' + str(round(mae, 1)))
# Calculate the root mean squared error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print('RMSE: ' + str(round(rmse, 1)))
# RMSE again, equivalently, via squared=False
rmse = mean_squared_error(y_test,
                          predictions,
                          squared=False)
print('RMSE: ' + str(round(rmse, 1)))
MAE: 5228.7
RMSE: 7609.8
RMSE: 7609.8
# Add the difference between the real and predicted cases
train = data_Barcelona[:(len(x_train)+92)]
valid = data_Barcelona[(len(x_train)+91):]
valid.insert(1, "Predictions", predictions, True)
valid.insert(1, "Difference", valid["Predictions"] - valid["num_casos"], True)
# Zoom in to a closer timeframe
# Date from which the data is displayed
display_start_date = "2021-10-31"
valid = valid[valid.index > display_start_date]
train = train[train.index > display_start_date]
# Visualize the data
matplotlib.style.use('ggplot')
fig, ax1 = plt.subplots(figsize=(22, 10), sharex=True)

# Data - Train
xt = train.index
yt = train[["num_casos"]]
# Data - Test / validation
xv = valid.index
yv = valid[["num_casos", "Predictions"]]

# Plot
plt.title("Barcelona: Predictions vs Real infections", fontsize=20)
plt.ylabel("Nº Cases", fontsize=18)

plt.plot(yt, color="blue", linewidth=1.5)
plt.plot(yv["Predictions"], color="red", linewidth=1.5)
plt.plot(yv["num_casos"], color="green", linewidth=1.5)
plt.legend(["Train", "LSTM Predictions", "Test"],
           loc="upper left", fontsize=18)

# Bar plot with the differences
x = valid.index
y = valid["Difference"]
plt.bar(x, y, width=0.2, color="grey")

plt.grid()
plt.show()
Madrid
data_Madrid = data_covid.loc[data_covid['provincia'] == 'Madrid']
data_Madrid = data_Madrid.set_index('fecha')
data_Madrid = data_Madrid.filter(['num_casos'])
data_Madrid = data_Madrid['2020-06-14':]
data_Madrid
fecha | num_casos
---|---
2020-06-14 | 81 |
2020-06-15 | 153 |
2020-06-16 | 91 |
2020-06-17 | 93 |
2020-06-18 | 85 |
... | ... |
2022-03-25 | 356 |
2022-03-26 | 303 |
2022-03-27 | 77 |
2022-03-28 | 839 |
2022-03-29 | 6 |
654 rows × 1 columns
data_Madrid.describe()
 | num_casos
---|---
count | 654.000000 |
mean | 2392.562691 |
std | 3390.272836 |
min | 6.000000 |
25% | 662.750000 |
50% | 1413.000000 |
75% | 2475.750000 |
max | 23811.000000 |
np_data_Madrid = data_Madrid.values
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data_Madrid = scaler.fit_transform(np_data_Madrid)
print(f'Length of the available dataset: {len(scaled_data_Madrid)}')
Length of the available dataset: 654
# Since we are going to predict future values based on the 90 past elements,
# we need to create a list with that historic information for each element
historic_values = 90
scaled_data_Madrid_x = []
scaled_data_Madrid_y = []

for num_casos_i in range(historic_values, len(scaled_data_Madrid)):
    scaled_data_Madrid_x.append(scaled_data_Madrid[(num_casos_i-historic_values):num_casos_i, 0])
    scaled_data_Madrid_y.append(scaled_data_Madrid[num_casos_i, 0])
# Convert the x and y lists to numpy arrays
scaled_data_Madrid_x = np.array(scaled_data_Madrid_x)
scaled_data_Madrid_y = np.array(scaled_data_Madrid_y)
# Train data looks like
scaled_data_Madrid_x[235]
array([0.11510187, 0.14820416, 0.08880487, 0.06481832, 0.11169922,
0.11354757, 0.089477 , 0.0694812 , 0.09300567, 0.06133165,
0.04709095, 0.08447805, 0.07746272, 0.06595253, 0.05708885,
0.06515438, 0.04835119, 0.04377232, 0.06372611, 0.05931527,
0.05141777, 0.04541063, 0.05330813, 0.04150389, 0.04032766,
0.05486242, 0.05225793, 0.04822516, 0.04259609, 0.05246797,
0.03919345, 0.03860534, 0.05561857, 0.05452636, 0.04814115,
0.04448645, 0.05288805, 0.04125184, 0.03994959, 0.05448435,
0.05183785, 0.0531401 , 0.05713085, 0.0431842 , 0.05061962,
0.0478891 , 0.07292586, 0.07830288, 0.0626339 , 0.05902121,
0.07485822, 0.05326612, 0.05335014, 0.07111951, 0.07095148,
0.07784079, 0.05469439, 0.06166772, 0.07149758, 0.07456417,
0.09880277, 0.09850872, 0.0857803 , 0.08178954, 0.08951901,
0.07225373, 0.05977736, 0.09750053, 0.09762655, 0.08930897,
0.08544423, 0.09489603, 0.07187566, 0.06406217, 0.09329973,
0.09855072, 0.08435203, 0.07653854, 0.08905692, 0.0647343 ,
0.05486242, 0.08237765, 0.0873766 , 0.07326192, 0.06259189,
0.07485822, 0.04654484, 0.04494854, 0.04721697, 0.06700273])
# The corresponding target value looks like
scaled_data_Madrid_y[235]
0.0648183154799412
# Since the first 90 values have no history, the dataset has been reduced by 90 values
print(f'Training data length with history: {len(scaled_data_Madrid_y)}')
Training data length with history: 564
# We split the data into train and test sets;
# as in previous analyses, we hold out the last 90 days for testing
x_train = scaled_data_Madrid_x[0:len(scaled_data_Madrid_x)-91]
y_train = scaled_data_Madrid_y[0:len(scaled_data_Madrid_y)-91]
print(f'Number of training samples: x={len(x_train)} - y={len(y_train)}')
x_test = scaled_data_Madrid_x[len(scaled_data_Madrid_x)-90:len(scaled_data_Madrid_x)]
y_test = scaled_data_Madrid_y[len(scaled_data_Madrid_y)-90:len(scaled_data_Madrid_y)]
print(f'Number of test samples: x={len(x_test)} - y={len(y_test)}')
Number of training samples: x=473 - y=473
Number of test samples: x=90 - y=90
# Reshape the data to feed the recurrent network
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
print("Train data shape:")
print(x_train.shape)
print(y_train.shape)
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))
print("Test data shape:")
print(x_test.shape)
print(y_test.shape)
Train data shape:
(473, 90, 1)
(473,)
Test data shape:
(90, 90, 1)
(90,)
# Configure / set up the neural network model - LSTM
# Build the model
print('Build model...')
model = Sequential()
# Stacked LSTM; input_shape = (timesteps, features)
neurons = x_train.shape[1]
model.add(LSTM(90,
               activation='relu',
               return_sequences=True,
               input_shape=(x_train.shape[1], 1)))
model.add(LSTM(50,
               activation='relu',
               return_sequences=True))
model.add(LSTM(25,
               activation='relu',
               return_sequences=False))
model.add(Dense(5, activation='relu'))
model.add(Dense(1))
Build model...
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()
Model: "sequential_2"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
lstm_6 (LSTM) (None, 90, 90) 33120
lstm_7 (LSTM) (None, 90, 50) 28200
lstm_8 (LSTM) (None, 25) 7600
dense_4 (Dense) (None, 5) 130
dense_5 (Dense) (None, 1) 6
=================================================================
Total params: 69,056
Trainable params: 69,056
Non-trainable params: 0
_________________________________________________________________
# Training the model
# fit network
history = model.fit(x_train,
                    y_train,
                    batch_size=1000,
                    epochs=30,
                    validation_data=(x_test, y_test),
                    verbose=0)
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show()
# Get the predicted values
predictions = model.predict(x_test)
predictions = scaler.inverse_transform(predictions)
3/3 [==============================] - 0s 30ms/step
y_test = y_test.reshape(-1, 1)
y_test = scaler.inverse_transform(y_test)
# Calculate the mean absolute error (MAE)
mae = mean_absolute_error(y_test, predictions)
print('MAE: ' + str(round(mae, 1)))
# Calculate the root mean squared error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print('RMSE: ' + str(round(rmse, 1)))
# RMSE again, equivalently, via squared=False
rmse = mean_squared_error(y_test,
                          predictions,
                          squared=False)
print('RMSE: ' + str(round(rmse, 1)))
MAE: 4748.2
RMSE: 6975.8
RMSE: 6975.8
# Add the difference between the real and predicted cases
train = data_Madrid[:(len(x_train)+92)]
valid = data_Madrid[(len(x_train)+91):]
valid.insert(1, "Predictions", predictions, True)
valid.insert(1, "Difference", valid["Predictions"] - valid["num_casos"], True)
# Zoom in to a closer timeframe
# Date from which the data is displayed
display_start_date = "2021-10-31"
valid = valid[valid.index > display_start_date]
train = train[train.index > display_start_date]
# Visualize the data
matplotlib.style.use('ggplot')
fig, ax1 = plt.subplots(figsize=(22, 10), sharex=True)

# Data - Train
xt = train.index
yt = train[["num_casos"]]
# Data - Test / validation
xv = valid.index
yv = valid[["num_casos", "Predictions"]]

# Plot
plt.title("Madrid: Predictions vs Real infections", fontsize=20)
plt.ylabel("Nº Cases", fontsize=18)

plt.plot(yt, color="blue", linewidth=1.5)
plt.plot(yv["Predictions"], color="red", linewidth=1.5)
plt.plot(yv["num_casos"], color="green", linewidth=1.5)
plt.legend(["Train", "LSTM Predictions", "Test"],
           loc="upper left", fontsize=18)

# Bar plot with the differences
x = valid.index
y = valid["Difference"]
plt.bar(x, y, width=0.2, color="grey")

plt.grid()
plt.show()
Málaga
data_Malaga = data_covid.loc[data_covid['provincia'] == 'Málaga']
data_Malaga = data_Malaga.set_index('fecha')
data_Malaga = data_Malaga.filter(['num_casos'])
data_Malaga = data_Malaga['2020-06-14':]
data_Malaga
fecha | num_casos
---|---
2020-06-14 | 2 |
2020-06-15 | 1 |
2020-06-16 | 1 |
2020-06-17 | 2 |
2020-06-18 | 2 |
... | ... |
2022-03-25 | 565 |
2022-03-26 | 79 |
2022-03-27 | 65 |
2022-03-28 | 39 |
2022-03-29 | 0 |
654 rows × 1 columns
data_Malaga.describe()
 | num_casos
---|---
count | 654.000000 |
mean | 410.206422 |
std | 489.452348 |
min | 0.000000 |
25% | 122.250000 |
50% | 215.500000 |
75% | 475.250000 |
max | 3080.000000 |
np_data_Malaga = data_Malaga.values
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data_Malaga = scaler.fit_transform(np_data_Malaga)
print(f'Length of the available dataset: {len(scaled_data_Malaga)}')
Length of the available dataset: 654
# Since we are going to predict future values based on the 90 past elements,
# we need to create a list with that historic information for each element
historic_values = 90
scaled_data_Malaga_x = []
scaled_data_Malaga_y = []

for num_casos_i in range(historic_values, len(scaled_data_Malaga)):
    scaled_data_Malaga_x.append(scaled_data_Malaga[(num_casos_i-historic_values):num_casos_i, 0])
    scaled_data_Malaga_y.append(scaled_data_Malaga[num_casos_i, 0])
# Convert the x and y lists to numpy arrays
scaled_data_Malaga_x = np.array(scaled_data_Malaga_x)
scaled_data_Malaga_y = np.array(scaled_data_Malaga_y)
# Train data looks like
scaled_data_Malaga_x[235]
array([0.24448052, 0.21915584, 0.15649351, 0.12045455, 0.15357143,
0.14448052, 0.1474026 , 0.14675325, 0.11655844, 0.09448052,
0.06753247, 0.09545455, 0.11071429, 0.06883117, 0.06980519,
0.06688312, 0.04902597, 0.03474026, 0.05974026, 0.04935065,
0.04967532, 0.04480519, 0.04188312, 0.03993506, 0.02954545,
0.04188312, 0.04772727, 0.05162338, 0.04902597, 0.04318182,
0.02792208, 0.0288961 , 0.04253247, 0.04123377, 0.04025974,
0.03538961, 0.05097403, 0.03051948, 0.02662338, 0.04123377,
0.03149351, 0.04350649, 0.03733766, 0.03571429, 0.02954545,
0.0211039 , 0.04188312, 0.04545455, 0.03863636, 0.04772727,
0.04935065, 0.03571429, 0.0288961 , 0.05357143, 0.05324675,
0.05616883, 0.04253247, 0.04058442, 0.03474026, 0.04935065,
0.06623377, 0.07824675, 0.06623377, 0.05909091, 0.06233766,
0.05064935, 0.03116883, 0.07077922, 0.05649351, 0.07792208,
0.06168831, 0.0711039 , 0.04155844, 0.04577922, 0.06623377,
0.06720779, 0.05909091, 0.05519481, 0.05324675, 0.04383117,
0.03668831, 0.05941558, 0.06525974, 0.06006494, 0.05551948,
0.05876623, 0.04512987, 0.04545455, 0.06266234, 0.07727273])
# The corresponding target value looks like
scaled_data_Malaga_y[235]
0.0737012987012987
# Since the first 90 values have no history, the dataset has been reduced by 90 values
print(f'Training data length with history: {len(scaled_data_Malaga_y)}')
Training data length with history: 564
# We split the data into train and test sets;
# as in previous analyses, we hold out the last 90 days for testing
x_train = scaled_data_Malaga_x[0:len(scaled_data_Malaga_x)-91]
y_train = scaled_data_Malaga_y[0:len(scaled_data_Malaga_y)-91]
print(f'Number of training samples: x={len(x_train)} - y={len(y_train)}')
x_test = scaled_data_Malaga_x[len(scaled_data_Malaga_x)-90:len(scaled_data_Malaga_x)]
y_test = scaled_data_Malaga_y[len(scaled_data_Malaga_y)-90:len(scaled_data_Malaga_y)]
print(f'Number of test samples: x={len(x_test)} - y={len(y_test)}')
Number of training samples: x=473 - y=473
Number of test samples: x=90 - y=90
# Reshape the data to feed the recurrent network
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
print("Train data shape:")
print(x_train.shape)
print(y_train.shape)
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))
print("Test data shape:")
print(x_test.shape)
print(y_test.shape)
Train data shape:
(473, 90, 1)
(473,)
Test data shape:
(90, 90, 1)
(90,)
# Configure / set up the neural network model - LSTM
# Build the model
print('Build model...')
model = Sequential()
# Stacked LSTM; input_shape = (timesteps, features)
neurons = x_train.shape[1]
model.add(LSTM(90,
               activation='relu',
               return_sequences=True,
               input_shape=(x_train.shape[1], 1)))
model.add(LSTM(50,
               activation='relu',
               return_sequences=True))
model.add(LSTM(25,
               activation='relu',
               return_sequences=False))
model.add(Dense(5, activation='relu'))
model.add(Dense(1))
Build model...
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()
Model: "sequential_3"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
lstm_9 (LSTM) (None, 90, 90) 33120
lstm_10 (LSTM) (None, 90, 50) 28200
lstm_11 (LSTM) (None, 25) 7600
dense_6 (Dense) (None, 5) 130
dense_7 (Dense) (None, 1) 6
=================================================================
Total params: 69,056
Trainable params: 69,056
Non-trainable params: 0
_________________________________________________________________
# Training the model
# fit network
history = model.fit(x_train,
                    y_train,
                    batch_size=1000,
                    epochs=30,
                    validation_data=(x_test, y_test),
                    verbose=0)
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show()
# Get the predicted values
predictions = model.predict(x_test)
predictions = scaler.inverse_transform(predictions)
3/3 [==============================] - 0s 31ms/step
y_test = y_test.reshape(-1, 1)
y_test = scaler.inverse_transform(y_test)
# Calculate the mean absolute error (MAE)
mae = mean_absolute_error(y_test, predictions)
print('MAE: ' + str(round(mae, 1)))
# Calculate the root mean squared error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print('RMSE: ' + str(round(rmse, 1)))
# RMSE again, equivalently, via squared=False
rmse = mean_squared_error(y_test,
                          predictions,
                          squared=False)
print('RMSE: ' + str(round(rmse, 1)))
MAE: 353.5
RMSE: 481.1
RMSE: 481.1
# Add the difference between the real and predicted cases
train = data_Malaga[:(len(x_train)+92)]
valid = data_Malaga[(len(x_train)+91):]
valid.insert(1, "Predictions", predictions, True)
valid.insert(1, "Difference", valid["Predictions"] - valid["num_casos"], True)
# Zoom in to a closer timeframe
# Date from which the data is displayed
display_start_date = "2021-10-31"
valid = valid[valid.index > display_start_date]
train = train[train.index > display_start_date]
# Visualize the data
matplotlib.style.use('ggplot')
fig, ax1 = plt.subplots(figsize=(22, 10), sharex=True)

# Data - Train
xt = train.index
yt = train[["num_casos"]]
# Data - Test / validation
xv = valid.index
yv = valid[["num_casos", "Predictions"]]

# Plot
plt.title("Malaga: Predictions vs Real infections", fontsize=20)
plt.ylabel("Nº Cases", fontsize=18)

plt.plot(yt, color="blue", linewidth=1.5)
plt.plot(yv["Predictions"], color="red", linewidth=1.5)
plt.plot(yv["num_casos"], color="green", linewidth=1.5)
plt.legend(["Train", "LSTM Predictions", "Test"],
           loc="upper left", fontsize=18)

# Bar plot with the differences
x = valid.index
y = valid["Difference"]
plt.bar(x, y, width=0.2, color="grey")

plt.grid()
plt.show()
Sevilla
data_Sevilla = data_covid.loc[data_covid['provincia'] == 'Sevilla']
data_Sevilla = data_Sevilla.set_index('fecha')
data_Sevilla = data_Sevilla.filter(['num_casos'])
data_Sevilla = data_Sevilla['2020-06-14':]
data_Sevilla
fecha | num_casos
---|---
2020-06-14 | 0 |
2020-06-15 | 2 |
2020-06-16 | 1 |
2020-06-17 | 0 |
2020-06-18 | 0 |
... | ... |
2022-03-25 | 365 |
2022-03-26 | 67 |
2022-03-27 | 24 |
2022-03-28 | 12 |
2022-03-29 | 0 |
654 rows × 1 columns
data_Sevilla.describe()
 | num_casos
---|---
count | 654.000000 |
mean | 432.978593 |
std | 500.618517 |
min | 0.000000 |
25% | 127.250000 |
50% | 282.000000 |
75% | 528.250000 |
max | 3692.000000 |
np_data_Sevilla = data_Sevilla.values
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data_Sevilla = scaler.fit_transform(np_data_Sevilla)
print(f'Length of the available dataset: {len(scaled_data_Sevilla)}')
Length of the available dataset: 654
# Since we are going to predict future values based on the 90 past elements,
# we need to create a list with that historic information for each element
historic_values = 90
scaled_data_Sevilla_x = []
scaled_data_Sevilla_y = []

for num_casos_i in range(historic_values, len(scaled_data_Sevilla)):
    scaled_data_Sevilla_x.append(scaled_data_Sevilla[(num_casos_i-historic_values):num_casos_i, 0])
    scaled_data_Sevilla_y.append(scaled_data_Sevilla[num_casos_i, 0])
# Convert the x and y lists to numpy arrays
scaled_data_Sevilla_x = np.array(scaled_data_Sevilla_x)
scaled_data_Sevilla_y = np.array(scaled_data_Sevilla_y)
# Train data looks like
scaled_data_Sevilla_x[235]
array([0.18634886, 0.19799567, 0.14626219, 0.0896533 , 0.13894908,
0.1695558 , 0.14572048, 0.12242687, 0.13434453, 0.06500542,
0.05390033, 0.08206934, 0.08667389, 0.09452871, 0.06798483,
0.05904659, 0.04252438, 0.0343987 , 0.06554713, 0.06473456,
0.05444204, 0.0503792 , 0.05796316, 0.04577465, 0.03819068,
0.04712893, 0.0528169 , 0.0663597 , 0.06013001, 0.05119177,
0.03764897, 0.031961 , 0.05119177, 0.05010834, 0.04739978,
0.05119177, 0.05390033, 0.03927411, 0.03737811, 0.06013001,
0.05633803, 0.05606717, 0.05850488, 0.06473456, 0.04658722,
0.04198267, 0.07150596, 0.07340195, 0.05823402, 0.08071506,
0.07773564, 0.05877573, 0.05471289, 0.07990249, 0.08829902,
0.0872156 , 0.09019502, 0.07069339, 0.08775731, 0.08396533,
0.14951246, 0.12432286, 0.12269772, 0.12378115, 0.13028169,
0.11348862, 0.07936078, 0.13299025, 0.12269772, 0.11213434,
0.12107259, 0.12432286, 0.09723727, 0.07042254, 0.09886241,
0.11782232, 0.10157096, 0.10346696, 0.12621885, 0.08342362,
0.07340195, 0.0847779 , 0.08559047, 0.11159263, 0.10292524,
0.08911159, 0.07502709, 0.04821235, 0.08315276, 0.09913326])
# The corresponding target value looks like
scaled_data_Sevilla_y[235]
0.10861321776814734
# Since the first 90 values have no history, the dataset has been reduced by 90 values
print(f'Training data length with history: {len(scaled_data_Sevilla_y)}')
Training data length with history: 564
# We split the data into train and test sets;
# as in previous analyses, we hold out the last 90 days for testing
x_train = scaled_data_Sevilla_x[0:len(scaled_data_Sevilla_x)-91]
y_train = scaled_data_Sevilla_y[0:len(scaled_data_Sevilla_y)-91]
print(f'Number of training samples: x={len(x_train)} - y={len(y_train)}')
x_test = scaled_data_Sevilla_x[len(scaled_data_Sevilla_x)-90:len(scaled_data_Sevilla_x)]
y_test = scaled_data_Sevilla_y[len(scaled_data_Sevilla_y)-90:len(scaled_data_Sevilla_y)]
print(f'Number of test samples: x={len(x_test)} - y={len(y_test)}')
Number of training samples: x=473 - y=473
Number of test samples: x=90 - y=90
# Reshape the data to feed the recurrent network
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
print("Train data shape:")
print(x_train.shape)
print(y_train.shape)
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))
print("Test data shape:")
print(x_test.shape)
print(y_test.shape)
Train data shape:
(473, 90, 1)
(473,)
Test data shape:
(90, 90, 1)
(90,)
# Configure / set up the neural network model - LSTM
# Build the model
print('Build model...')
model = Sequential()
# Stacked LSTM; input_shape = (timesteps, features)
neurons = x_train.shape[1]
model.add(LSTM(90,
               activation='relu',
               return_sequences=True,
               input_shape=(x_train.shape[1], 1)))
model.add(LSTM(50,
               activation='relu',
               return_sequences=True))
model.add(LSTM(25,
               activation='relu',
               return_sequences=False))
model.add(Dense(5, activation='relu'))
model.add(Dense(1))
Build model...
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()
Model: "sequential_4"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
lstm_12 (LSTM) (None, 90, 90) 33120
lstm_13 (LSTM) (None, 90, 50) 28200
lstm_14 (LSTM) (None, 25) 7600
dense_8 (Dense) (None, 5) 130
dense_9 (Dense) (None, 1) 6
=================================================================
Total params: 69,056
Trainable params: 69,056
Non-trainable params: 0
_________________________________________________________________
# Training the model
# fit network
history = model.fit(x_train,
                    y_train,
                    batch_size=1000,
                    epochs=30,
                    validation_data=(x_test, y_test),
                    verbose=0)
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show()
# Get the predicted values
predictions = model.predict(x_test)
predictions = scaler.inverse_transform(predictions)
WARNING:tensorflow:5 out of the last 13 calls to <function Model.make_predict_function.<locals>.predict_function at 0x7fdc69aa8c10> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings could be due to (1) creating @tf.function repeatedly in a loop, (2) passing tensors with different shapes, (3) passing Python objects instead of tensors. For (1), please define your @tf.function outside of the loop. For (2), @tf.function has reduce_retracing=True option that can avoid unnecessary retracing. For (3), please refer to https://www.tensorflow.org/guide/function#controlling_retracing and https://www.tensorflow.org/api_docs/python/tf/function for more details.
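This retracing warning appears because each section compiles a brand-new Sequential model, so model.predict builds a fresh predict_function every time; it is harmless here. Calling the model directly is one way to sidestep it (an equivalent sketch, not required for correctness):
# predictions = model(x_test, training=False).numpy()
# predictions = scaler.inverse_transform(predictions)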
3/3 [==============================] - 0s 32ms/step
y_test = y_test.reshape(-1, 1)
y_test = scaler.inverse_transform(y_test)
# Calculate the mean absolute error (MAE)
mae = mean_absolute_error(y_test, predictions)
print('MAE: ' + str(round(mae, 1)))
# Calculate the root mean squared error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print('RMSE: ' + str(round(rmse, 1)))
# RMSE again, equivalently, via squared=False
rmse = mean_squared_error(y_test,
                          predictions,
                          squared=False)
print('RMSE: ' + str(round(rmse, 1)))
MAE: 335.1
RMSE: 474.0
RMSE: 474.0
# Add the difference between the real and predicted cases
train = data_Sevilla[:(len(x_train)+92)]
valid = data_Sevilla[(len(x_train)+91):]
valid.insert(1, "Predictions", predictions, True)
valid.insert(1, "Difference", valid["Predictions"] - valid["num_casos"], True)
# Zoom in to a closer timeframe
# Date from which the data is displayed
display_start_date = "2021-10-31"
valid = valid[valid.index > display_start_date]
train = train[train.index > display_start_date]
# Visualize the data
matplotlib.style.use('ggplot')
fig, ax1 = plt.subplots(figsize=(22, 10), sharex=True)

# Data - Train
xt = train.index
yt = train[["num_casos"]]
# Data - Test / validation
xv = valid.index
yv = valid[["num_casos", "Predictions"]]

# Plot
plt.title("Sevilla: Predictions vs Real infections", fontsize=20)
plt.ylabel("Nº Cases", fontsize=18)

plt.plot(yt, color="blue", linewidth=1.5)
plt.plot(yv["Predictions"], color="red", linewidth=1.5)
plt.plot(yv["num_casos"], color="green", linewidth=1.5)
plt.legend(["Train", "LSTM Predictions", "Test"],
           loc="upper left", fontsize=18)

# Bar plot with the differences
x = valid.index
y = valid["Difference"]
plt.bar(x, y, width=0.2, color="grey")

plt.grid()
plt.show()
Just before the sixth wave
Asturias
data_asturias = data_covid.loc[data_covid['provincia'] == 'Asturias']
data_asturias = data_asturias.set_index('fecha')
data_asturias = data_asturias.filter(['num_casos'])
data_asturias = data_asturias['2020-06-14':'2021-12-31']
data_asturias
fecha | num_casos
---|---
2020-06-14 | 0 |
2020-06-15 | 0 |
2020-06-16 | 1 |
2020-06-17 | 0 |
2020-06-18 | 0 |
... | ... |
2021-12-27 | 2363 |
2021-12-28 | 2150 |
2021-12-29 | 2159 |
2021-12-30 | 2020 |
2021-12-31 | 1949 |
566 rows × 1 columns
data_asturias.describe()
 | num_casos
---|---
count | 566.000000 |
mean | 180.913428 |
std | 276.250633 |
min | 0.000000 |
25% | 36.000000 |
50% | 94.500000 |
75% | 225.750000 |
max | 2363.000000 |
np_data_asturias = data_asturias.values
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data_asturias = scaler.fit_transform(np_data_asturias)
print(f'Length of the available dataset: {len(scaled_data_asturias)}')
Length of the available dataset: 566
# Since we are going to predict future values based on the 90 past elements,
# we need to create a list with that historic information for each element
historic_values = 90
scaled_data_asturias_x = []
scaled_data_asturias_y = []

for num_casos_i in range(historic_values, len(scaled_data_asturias)):
    scaled_data_asturias_x.append(scaled_data_asturias[(num_casos_i-historic_values):num_casos_i, 0])
    scaled_data_asturias_y.append(scaled_data_asturias[num_casos_i, 0])
# Convert the x and y lists to numpy arrays
scaled_data_asturias_x = np.array(scaled_data_asturias_x)
scaled_data_asturias_y = np.array(scaled_data_asturias_y)
# Since the first 90 values have no history, the dataset has been reduced by 90 values
print(f'Training data length with history: {len(scaled_data_asturias_y)}')
Training data length with history: 476
# We split the data into train and test sets;
# as in previous analyses, we hold out the last 90 days for testing
x_train = scaled_data_asturias_x[0:len(scaled_data_asturias_x)-91]
y_train = scaled_data_asturias_y[0:len(scaled_data_asturias_y)-91]
print(f'Number of training samples: x={len(x_train)} - y={len(y_train)}')
x_test = scaled_data_asturias_x[len(scaled_data_asturias_x)-90:len(scaled_data_asturias_x)]
y_test = scaled_data_asturias_y[len(scaled_data_asturias_y)-90:len(scaled_data_asturias_y)]
print(f'Number of test samples: x={len(x_test)} - y={len(y_test)}')
Number of training samples: x=385 - y=385
Number of test samples: x=90 - y=90
# Reshape the data to feed the recurrent network
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
print("Train data shape:")
print(x_train.shape)
print(y_train.shape)
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))
print("Test data shape:")
print(x_test.shape)
print(y_test.shape)
Train data shape:
(385, 90, 1)
(385,)
Test data shape:
(90, 90, 1)
(90,)
# Configure / set up the neural network model - LSTM
# Build the model
print('Build model...')
model = Sequential()
# Stacked LSTM; input_shape = (timesteps, features)
neurons = x_train.shape[1]
model.add(LSTM(90,
               activation='relu',
               return_sequences=True,
               input_shape=(x_train.shape[1], 1)))
model.add(LSTM(50,
               activation='relu',
               return_sequences=True))
model.add(LSTM(25,
               activation='relu',
               return_sequences=False))
model.add(Dense(5, activation='relu'))
model.add(Dense(1))
Build model...
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()
Model: "sequential_5"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
lstm_15 (LSTM) (None, 90, 90) 33120
lstm_16 (LSTM) (None, 90, 50) 28200
lstm_17 (LSTM) (None, 25) 7600
dense_10 (Dense) (None, 5) 130
dense_11 (Dense) (None, 1) 6
=================================================================
Total params: 69,056
Trainable params: 69,056
Non-trainable params: 0
_________________________________________________________________
# Training the model
# fit network
history = model.fit(x_train,
                    y_train,
                    batch_size=1000,
                    epochs=50,
                    validation_data=(x_test, y_test),
                    verbose=0)
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show()
# Get the predicted values
predictions = model.predict(x_test)
predictions = scaler.inverse_transform(predictions)
WARNING:tensorflow:5 out of the last 13 calls to <function Model.make_predict_function.<locals>.predict_function at 0x7fdc3a579c10> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings could be due to (1) creating @tf.function repeatedly in a loop, (2) passing tensors with different shapes, (3) passing Python objects instead of tensors. For (1), please define your @tf.function outside of the loop. For (2), @tf.function has reduce_retracing=True option that can avoid unnecessary retracing. For (3), please refer to https://www.tensorflow.org/guide/function#controlling_retracing and https://www.tensorflow.org/api_docs/python/tf/function for more details.
3/3 [==============================] - 0s 31ms/step
y_test = y_test.reshape(-1, 1)
y_test = scaler.inverse_transform(y_test)
# Calculate the mean absolute error (MAE)
mae = mean_absolute_error(y_test, predictions)
print('MAE: ' + str(round(mae, 1)))
# Calculate the root mean squared error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print('RMSE: ' + str(round(rmse, 1)))
# RMSE again, equivalently, via squared=False
rmse = mean_squared_error(y_test,
                          predictions,
                          squared=False)
print('RMSE: ' + str(round(rmse, 1)))
MAE: 208.7
RMSE: 387.0
RMSE: 387.0
# Add the difference between the real and predicted cases
train = data_asturias[:(len(x_train)+92)]
valid = data_asturias[(len(x_train)+91):]
valid.insert(1, "Predictions", predictions, True)
valid.insert(1, "Difference", valid["Predictions"] - valid["num_casos"], True)
# Zoom in to a closer timeframe
# Date from which the data is displayed
display_start_date = "2021-07-15"
valid = valid[valid.index > display_start_date]
train = train[train.index > display_start_date]
# Visualize the data
matplotlib.style.use('ggplot')
fig, ax1 = plt.subplots(figsize=(22, 10), sharex=True)

# Data - Train
xt = train.index
yt = train[["num_casos"]]
# Data - Test / validation
xv = valid.index
yv = valid[["num_casos", "Predictions"]]

# Plot
plt.title("Asturias: Predictions vs Real infections", fontsize=20)
plt.ylabel("Nº Cases", fontsize=18)

plt.plot(yt, color="blue", linewidth=1.5)
plt.plot(yv["Predictions"], color="red", linewidth=1.5)
plt.plot(yv["num_casos"], color="green", linewidth=1.5)
plt.legend(["Train", "LSTM Predictions", "Test"],
           loc="upper left", fontsize=18)

# Bar plot with the differences
x = valid.index
y = valid["Difference"]
plt.bar(x, y, width=0.2, color="grey")

plt.grid()
plt.show()
Barcelona
data_Barcelona = data_covid.loc[data_covid['provincia'] == 'Barcelona']
data_Barcelona = data_Barcelona.set_index('fecha')
data_Barcelona = data_Barcelona.filter(['num_casos'])
data_Barcelona = data_Barcelona['2020-06-14':'2021-12-31']
data_Barcelona
fecha | num_casos
---|---
2020-06-14 | 33 |
2020-06-15 | 62 |
2020-06-16 | 66 |
2020-06-17 | 70 |
2020-06-18 | 68 |
... | ... |
2021-12-27 | 19383 |
2021-12-28 | 20192 |
2021-12-29 | 19361 |
2021-12-30 | 17639 |
2021-12-31 | 16651 |
566 rows × 1 columns
data_Barcelona.describe()
 | num_casos
---|---
count | 566.000000 |
mean | 1575.480565 |
std | 2281.290383 |
min | 33.000000 |
25% | 544.750000 |
50% | 944.500000 |
75% | 1609.500000 |
max | 20192.000000 |
np_data_Barcelona = data_Barcelona.values
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data_Barcelona = scaler.fit_transform(np_data_Barcelona)
print(f'Length of the available dataset: {len(scaled_data_Barcelona)}')
Length of the available dataset: 566
# Since we are going to predict future values based on the 90 past elements,
# we need to create a list with that historic information for each element
historic_values = 90
scaled_data_Barcelona_x = []
scaled_data_Barcelona_y = []

for num_casos_i in range(historic_values, len(scaled_data_Barcelona)):
    scaled_data_Barcelona_x.append(scaled_data_Barcelona[(num_casos_i-historic_values):num_casos_i, 0])
    scaled_data_Barcelona_y.append(scaled_data_Barcelona[num_casos_i, 0])
# Convert the x and y lists to numpy arrays
scaled_data_Barcelona_x = np.array(scaled_data_Barcelona_x)
scaled_data_Barcelona_y = np.array(scaled_data_Barcelona_y)
# Train data looks like
scaled_data_Barcelona_x[235]
array([0.08507366, 0.07971626, 0.04826628, 0.04151992, 0.07822809,
0.07048961, 0.06289995, 0.05908031, 0.06354482, 0.04345454,
0.03884121, 0.06855499, 0.06309837, 0.06012203, 0.06116375,
0.05208592, 0.0405278 , 0.03422789, 0.06364403, 0.06056848,
0.05337566, 0.045786 , 0.04687733, 0.03323578, 0.02852324,
0.05833623, 0.05064735, 0.04777023, 0.03864279, 0.04330572,
0.03065628, 0.02668783, 0.04707575, 0.04355375, 0.04449625,
0.04251203, 0.04305769, 0.03199563, 0.02842403, 0.05317724,
0.04955603, 0.04266085, 0.04722456, 0.04861352, 0.0360633 ,
0.03090431, 0.06240389, 0.06106454, 0.05441738, 0.05074656,
0.05957637, 0.04181755, 0.03536882, 0.06384245, 0.05734411,
0.05084578, 0.0563024 , 0.03655935, 0.04191676, 0.04196637,
0.04712535, 0.06438811, 0.06275113, 0.05883228, 0.06170941,
0.0475222 , 0.03482316, 0.0651322 , 0.05873307, 0.06696761,
0.05268118, 0.05679845, 0.04008135, 0.03571606, 0.06642195,
0.077484 , 0.05863386, 0.06652116, 0.0546158 , 0.04608364,
0.03933727, 0.07872414, 0.07088645, 0.06389206, 0.06156059,
0.05878268, 0.03462473, 0.02753113, 0.06384245, 0.04682772])
# The corresponding target value looks like
scaled_data_Barcelona_y[235]
0.05114340989136366
# Since the first 90 values have no history, the dataset has been reduced by 90 values
print(f'Training data length with history: {len(scaled_data_Barcelona_y)}')
Training data length with history: 476
# We split the data into train and test sets;
# as in previous analyses, we hold out the last 90 days for testing
x_train = scaled_data_Barcelona_x[0:len(scaled_data_Barcelona_x)-91]
y_train = scaled_data_Barcelona_y[0:len(scaled_data_Barcelona_y)-91]
print(f'Number of training samples: x={len(x_train)} - y={len(y_train)}')
x_test = scaled_data_Barcelona_x[len(scaled_data_Barcelona_x)-90:len(scaled_data_Barcelona_x)]
y_test = scaled_data_Barcelona_y[len(scaled_data_Barcelona_y)-90:len(scaled_data_Barcelona_y)]
print(f'Number of test samples: x={len(x_test)} - y={len(y_test)}')
Number of training samples: x=385 - y=385
Number of test samples: x=90 - y=90
# Reshape the data to feed the recurrent network
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
print("Train data shape:")
print(x_train.shape)
print(y_train.shape)
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))
print("Test data shape:")
print(x_test.shape)
print(y_test.shape)
Train data shape:
(385, 90, 1)
(385,)
Test data shape:
(90, 90, 1)
(90,)
# Configure / set up the neural network model - LSTM
# Build the model
print('Build model...')
model = Sequential()
# Stacked LSTM; input_shape = (timesteps, features)
neurons = x_train.shape[1]
model.add(LSTM(90,
               activation='relu',
               return_sequences=True,
               input_shape=(x_train.shape[1], 1)))
model.add(LSTM(50,
               activation='relu',
               return_sequences=True))
model.add(LSTM(25,
               activation='relu',
               return_sequences=False))
model.add(Dense(5, activation='relu'))
model.add(Dense(1))
Build model...
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()
Model: "sequential_6"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
lstm_18 (LSTM) (None, 90, 90) 33120
lstm_19 (LSTM) (None, 90, 50) 28200
lstm_20 (LSTM) (None, 25) 7600
dense_12 (Dense) (None, 5) 130
dense_13 (Dense) (None, 1) 6
=================================================================
Total params: 69,056
Trainable params: 69,056
Non-trainable params: 0
_________________________________________________________________
# Training the model
# fit network
history = model.fit(x_train,
                    y_train,
                    batch_size=1000,
                    epochs=50,
                    validation_data=(x_test, y_test),
                    verbose=0)
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show()
# Get the predicted values
predictions = model.predict(x_test)
predictions = scaler.inverse_transform(predictions)
3/3 [==============================] - 0s 31ms/step
y_test = y_test.reshape(-1, 1)
y_test = scaler.inverse_transform(y_test)
# Calculate the mean absolute error (MAE)
mae = mean_absolute_error(y_test, predictions)
print('MAE: ' + str(round(mae, 1)))
# Calculate the root mean squared error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print('RMSE: ' + str(round(rmse, 1)))
# RMSE again, equivalently, via squared=False
rmse = mean_squared_error(y_test,
                          predictions,
                          squared=False)
print('RMSE: ' + str(round(rmse, 1)))
MAE: 1674.3
RMSE: 3525.8
RMSE: 3525.8
# Add the difference between the real and predicted cases
train = data_Barcelona[:(len(x_train)+92)]
valid = data_Barcelona[(len(x_train)+91):]
valid.insert(1, "Predictions", predictions, True)
valid.insert(1, "Difference", valid["Predictions"] - valid["num_casos"], True)
# Zoom in to a closer timeframe
# Date from which the data is displayed
display_start_date = "2021-07-15"
valid = valid[valid.index > display_start_date]
train = train[train.index > display_start_date]
# Visualize the data
matplotlib.style.use('ggplot')
fig, ax1 = plt.subplots(figsize=(22, 10), sharex=True)

# Data - Train
xt = train.index
yt = train[["num_casos"]]
# Data - Test / validation
xv = valid.index
yv = valid[["num_casos", "Predictions"]]

# Plot
plt.title("Barcelona: Predictions vs Real infections", fontsize=20)
plt.ylabel("Nº Cases", fontsize=18)

plt.plot(yt, color="blue", linewidth=1.5)
plt.plot(yv["Predictions"], color="red", linewidth=1.5)
plt.plot(yv["num_casos"], color="green", linewidth=1.5)
plt.legend(["Train", "LSTM Predictions", "Test"],
           loc="upper left", fontsize=18)

# Bar plot with the differences
x = valid.index
y = valid["Difference"]
plt.bar(x, y, width=0.2, color="grey")

plt.grid()
plt.show()
Madrid
data_Madrid = data_covid.loc[data_covid['provincia'] == 'Madrid']
data_Madrid = data_Madrid.set_index('fecha')
data_Madrid = data_Madrid.filter(['num_casos'])
data_Madrid = data_Madrid['2020-06-14':'2021-12-31']
data_Madrid
fecha | num_casos
---|---
2020-06-14 | 81 |
2020-06-15 | 153 |
2020-06-16 | 91 |
2020-06-17 | 93 |
2020-06-18 | 85 |
... | ... |
2021-12-27 | 22958 |
2021-12-28 | 23811 |
2021-12-29 | 21914 |
2021-12-30 | 20666 |
2021-12-31 | 7556 |
566 rows × 1 columns
data_Madrid.describe()
 | num_casos
---|---
count | 566.000000 |
mean | 1994.136042 |
std | 2795.419848 |
min | 28.000000 |
25% | 550.250000 |
50% | 1312.500000 |
75% | 2282.500000 |
max | 23811.000000 |
np_data_Madrid = data_Madrid.values
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data_Madrid = scaler.fit_transform(np_data_Madrid)
print(f'Length of the available dataset: {len(scaled_data_Madrid)}')
Length of the available dataset: 566
# Since we are going to predict future values based on the X past elements,
# we need to create a list with those historic information for each element
= 90
historic_values = []
scaled_data_Madrid_x = []
scaled_data_Madrid_y
for num_casos_i in range(historic_values, len(scaled_data_Madrid)):
-historic_values):num_casos_i, 0])
scaled_data_Madrid_x.append(scaled_data_Madrid[(num_casos_i0])
scaled_data_Madrid_y.append(scaled_data_Madrid[num_casos_i,
# Convert the x and y lists to numpy arrays
scaled_data_Madrid_x = np.array(scaled_data_Madrid_x)
scaled_data_Madrid_y = np.array(scaled_data_Madrid_y)
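This sliding-window construction is repeated verbatim for every province, so it could be factored into a small helper. A sketch under that assumption (make_windows is an illustrative name, not from this notebook); the comprehensions reproduce exactly the loop above:

def make_windows(series_2d, lookback):
    # series_2d: scaled array of shape (n, 1);
    # returns x with shape (n - lookback, lookback) and y with shape (n - lookback,)
    xs = [series_2d[i - lookback:i, 0] for i in range(lookback, len(series_2d))]
    ys = [series_2d[i, 0] for i in range(lookback, len(series_2d))]
    return np.array(xs), np.array(ys)

# e.g. scaled_data_Madrid_x, scaled_data_Madrid_y = make_windows(scaled_data_Madrid, 90)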
# One input sample (90 days of history) looks like
scaled_data_Madrid_x[235]
array([0.11428331, 0.14741622, 0.08796199, 0.06395324, 0.11087752,
0.11272758, 0.08863474, 0.06862044, 0.09216667, 0.06046336,
0.04620948, 0.08363117, 0.07660934, 0.06508851, 0.05621663,
0.06428962, 0.04747088, 0.04288778, 0.06286003, 0.05844511,
0.0505403 , 0.0445276 , 0.05243241, 0.04061725, 0.03943994,
0.05398814, 0.05138124, 0.04734474, 0.04171047, 0.05159147,
0.03830467, 0.03771602, 0.05474499, 0.05365177, 0.04726065,
0.04360257, 0.05201194, 0.04036497, 0.03906151, 0.05360972,
0.05096077, 0.05226422, 0.05625867, 0.04229912, 0.04974141,
0.04700837, 0.07206828, 0.07745028, 0.06176681, 0.05815078,
0.07400244, 0.05239036, 0.05247446, 0.07026027, 0.07009208,
0.07698776, 0.05381996, 0.06079973, 0.07063869, 0.07370811,
0.09796914, 0.09767481, 0.08493462, 0.08094017, 0.08867679,
0.07139553, 0.05890762, 0.09666569, 0.09679183, 0.08846655,
0.08459824, 0.09405878, 0.07101711, 0.0631964 , 0.092461 ,
0.09771686, 0.08350502, 0.07568431, 0.08821427, 0.06386915,
0.05398814, 0.08152882, 0.0865324 , 0.07240466, 0.06172476,
0.07400244, 0.04566287, 0.04406509, 0.04633562, 0.06613968])
# ... and its target (the next day) looks like
scaled_data_Madrid_y[235]
0.0639532439137199
# Since the first 90 values have no history behind them, the dataset shrinks by 90 elements
print(f'Length of the training data with history: {len(scaled_data_Madrid_y)}')
Length of the training data with history: 476
# We split the data into train and test sets;
# as in the previous analyses, we predict at most 91 days ahead
x_train = scaled_data_Madrid_x[0:len(scaled_data_Madrid_x) - 91]
y_train = scaled_data_Madrid_y[0:len(scaled_data_Madrid_y) - 91]
print(f'Training data size: x={len(x_train)} - y={len(y_train)}')
x_test = scaled_data_Madrid_x[len(scaled_data_Madrid_x) - 90:len(scaled_data_Madrid_x)]
y_test = scaled_data_Madrid_y[len(scaled_data_Madrid_y) - 90:len(scaled_data_Madrid_y)]
print(f'Test data size: x={len(x_test)} - y={len(y_test)}')
Training data size: x=385 - y=385
Test data size: x=90 - y=90
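A single chronological holdout is the simplest option; for a more robust error estimate, scikit-learn's TimeSeriesSplit yields several expanding-window train/test folds. A hedged sketch, assuming the windowed arrays above (the n_splits and test_size values are illustrative):

from sklearn.model_selection import TimeSeriesSplit

tscv = TimeSeriesSplit(n_splits=4, test_size=90)
for train_idx, test_idx in tscv.split(scaled_data_Madrid_x):
    print(len(train_idx), len(test_idx))  # expanding train window, fixed 90-day test block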
# Reshape the data to feed the recurrent network:
# Keras LSTMs expect (samples, timesteps, features)
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
print("Train data shape:")
print(x_train.shape)
print(y_train.shape)
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))
print("Test data shape:")
print(x_test.shape)
print(y_test.shape)
Train data shape:
(385, 90, 1)
(385,)
Test data shape:
(90, 90, 1)
(90,)
# Configure / set up the neural network model - LSTM
# Build the model
print('Build model...')
model = Sequential()
# The input shape is (timesteps, features): 90 past days, 1 feature per day
neurons = x_train.shape[1]
model.add(LSTM(90,
               activation='relu',
               return_sequences=True,
               input_shape=(x_train.shape[1], 1)))
model.add(LSTM(50,
               activation='relu',
               return_sequences=True))
model.add(LSTM(25,
               activation='relu',
               return_sequences=False))
model.add(Dense(5, activation='relu'))
model.add(Dense(1))
Build model...
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()
Model: "sequential_7"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
lstm_21 (LSTM) (None, 90, 90) 33120
lstm_22 (LSTM) (None, 90, 50) 28200
lstm_23 (LSTM) (None, 25) 7600
dense_14 (Dense) (None, 5) 130
dense_15 (Dense) (None, 1) 6
=================================================================
Total params: 69,056
Trainable params: 69,056
Non-trainable params: 0
_________________________________________________________________
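As a sanity check on the summary, an LSTM layer with input dimension d and u units has 4·((d + u)·u + u) parameters (four gates, each with a kernel, a recurrent kernel and a bias), and a Dense layer has in·out + out. A quick verification of the table above (illustrative code, not from the notebook):

def lstm_params(input_dim, units):
    # four gates, each with kernel, recurrent kernel and bias
    return 4 * ((input_dim + units) * units + units)

print(lstm_params(1, 90))     # 33120: the first layer sees 1 feature per timestep
print(lstm_params(90, 50))    # 28200: the second layer sees 90 features per timestep
print(lstm_params(50, 25))    # 7600
print(25 * 5 + 5, 5 * 1 + 1)  # 130 and 6 for the two Dense layers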
# Training the model
# Fit the network
history = model.fit(x_train,
                    y_train,
                    batch_size=1000,
                    epochs=50,
                    validation_data=(x_test, y_test),
                    verbose=0)
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show()
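Two caveats about this training setup: with only 385 training samples, batch_size=1000 makes each epoch a single full-batch gradient step, and passing the test set as validation_data means the 'test' curve above is not an independent check. A hedged sketch of one common refinement, using Keras' standard EarlyStopping callback (the batch size, epoch budget and patience are arbitrary illustrative choices):

from keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss',  # stop once validation loss stalls
                           patience=10,
                           restore_best_weights=True)
history_alt = model.fit(x_train, y_train,
                        batch_size=32,          # several gradient updates per epoch
                        epochs=200,
                        validation_data=(x_test, y_test),
                        callbacks=[early_stop],
                        verbose=0)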
# Get the predicted values
predictions = model.predict(x_test)
predictions = scaler.inverse_transform(predictions)
3/3 [==============================] - 0s 31ms/step
y_test = y_test.reshape(-1, 1)
y_test = scaler.inverse_transform(y_test)
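Note that each of these 90 predictions is made from a window of real, observed history, so the evaluation below measures one-step-ahead accuracy rather than a true 90-day forecast. A genuine multi-step forecast would feed each prediction back into the window; a minimal sketch, assuming the trained model, the fitted scaler and the scaled Madrid series from above:

# Hedged sketch of recursive multi-step forecasting (not the notebook's method)
window = scaled_data_Madrid[-90:, 0].tolist()  # the most recent 90 scaled values
future = []
for _ in range(90):
    x = np.array(window[-90:]).reshape(1, 90, 1)
    next_scaled = float(model.predict(x, verbose=0)[0, 0])
    future.append(next_scaled)
    window.append(next_scaled)                 # feed the prediction back in
future_cases = scaler.inverse_transform(np.array(future).reshape(-1, 1))

Errors compound under this scheme, so its accuracy is typically much worse than the one-step figures reported below.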
# Calculate the mean absolute error (MAE)
mae = mean_absolute_error(y_test, predictions)
print('MAE: ' + str(round(mae, 1)))
# Calculate the root mean squared error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print('RMSE: ' + str(round(rmse, 1)))
# Equivalently, let scikit-learn return the RMSE directly
rmse = mean_squared_error(y_test, predictions, squared=False)
print('RMSE: ' + str(round(rmse, 1)))
MAE: 2521.6
RMSE: 5345.2
RMSE: 5345.2
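Since daily case counts differ by an order of magnitude between provinces, the raw MAE and RMSE are hard to compare across these sections; dividing by the mean of the (inverse-transformed) test values gives a rough unit-free figure. An illustrative two-liner using the variables from the cells above:

# relative errors, as a scale-free complement to the absolute figures
print(f'MAE/mean: {mae / y_test.mean():.2f} - RMSE/mean: {rmse / y_test.mean():.2f}')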
# Add the difference between the real and the predicted cases
train = data_Madrid[:(len(x_train) + 92)]
valid = data_Madrid[(len(x_train) + 91):]
valid.insert(1, "Predictions", predictions, True)
valid.insert(1, "Difference", valid["Predictions"] - valid["num_casos"], True)
# Zoom in to a closer timeframe
# Date from which onward the data is displayed
display_start_date = "2021-07-15"
valid = valid[valid.index > display_start_date]
train = train[train.index > display_start_date]
# Visualize the data
matplotlib.style.use('ggplot')
fig, ax1 = plt.subplots(figsize=(22, 10), sharex=True)

# Data - Train
xt = train.index
yt = train[["num_casos"]]
# Data - Test / validation
xv = valid.index
yv = valid[["num_casos", "Predictions"]]

# Plot
plt.title("Madrid: Predictions vs Real infections", fontsize=20)
plt.ylabel("Nº Cases", fontsize=18)

plt.plot(yt, color="blue", linewidth=1.5)
plt.plot(yv["Predictions"], color="red", linewidth=1.5)
plt.plot(yv["num_casos"], color="green", linewidth=1.5)
plt.legend(["Train", "LSTM Predictions", "Test"], loc="upper left", fontsize=18)

# Bar plot with the differences
x = valid.index
y = valid["Difference"]
plt.bar(x, y, width=0.2, color="grey")

plt.grid()
plt.show()
Málaga
data_Malaga = data_covid.loc[data_covid['provincia'] == 'Málaga']
data_Malaga = data_Malaga.set_index('fecha')
data_Malaga = data_Malaga.filter(['num_casos'])
data_Malaga = data_Malaga['2020-06-14':'2021-12-31']
data_Malaga
num_casos | |
---|---|
fecha | |
2020-06-14 | 2 |
2020-06-15 | 1 |
2020-06-16 | 1 |
2020-06-17 | 2 |
2020-06-18 | 2 |
... | ... |
2021-12-27 | 1627 |
2021-12-28 | 2772 |
2021-12-29 | 3080 |
2021-12-30 | 3075 |
2021-12-31 | 2646 |
566 rows × 1 columns
data_Malaga.describe()
num_casos | |
---|---|
count | 566.000000 |
mean | 339.227915 |
std | 427.261346 |
min | 1.000000 |
25% | 110.000000 |
50% | 193.500000 |
75% | 344.750000 |
max | 3080.000000 |
np_data_Malaga = data_Malaga.values
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data_Malaga = scaler.fit_transform(np_data_Malaga)
print(f'Length of the available dataset: {len(scaled_data_Malaga)}')
Length of the available dataset: 566
# Since we are going to predict future values based on the X past elements,
# we need to build the historical window for each element
historic_values = 90
scaled_data_Malaga_x = []
scaled_data_Malaga_y = []

for num_casos_i in range(historic_values, len(scaled_data_Malaga)):
    scaled_data_Malaga_x.append(scaled_data_Malaga[(num_casos_i - historic_values):num_casos_i, 0])
    scaled_data_Malaga_y.append(scaled_data_Malaga[num_casos_i, 0])
# Convert the x and y lists to numpy arrays
scaled_data_Malaga_x = np.array(scaled_data_Malaga_x)
scaled_data_Malaga_y = np.array(scaled_data_Malaga_y)
# One input sample (90 days of history) looks like
scaled_data_Malaga_x[235]
array([0.24423514, 0.21890224, 0.15621955, 0.12016889, 0.15329652,
0.14420266, 0.14712569, 0.14647613, 0.11627152, 0.09418642,
0.06722962, 0.09516077, 0.11042546, 0.06852874, 0.06950309,
0.06658006, 0.04871712, 0.03442676, 0.05943488, 0.0490419 ,
0.04936668, 0.04449497, 0.04157194, 0.03962325, 0.02923027,
0.04157194, 0.04741799, 0.05131536, 0.04871712, 0.04287106,
0.02760637, 0.02858071, 0.0422215 , 0.04092238, 0.03994804,
0.03507632, 0.0506658 , 0.03020461, 0.02630724, 0.04092238,
0.03117895, 0.04319584, 0.03702501, 0.0354011 , 0.02923027,
0.02078597, 0.04157194, 0.04514453, 0.03832413, 0.04741799,
0.0490419 , 0.0354011 , 0.02858071, 0.05326405, 0.05293927,
0.05586229, 0.0422215 , 0.04027282, 0.03442676, 0.0490419 ,
0.0659305 , 0.07794739, 0.0659305 , 0.05878532, 0.06203313,
0.05034102, 0.03085417, 0.07047743, 0.05618707, 0.0776226 ,
0.06138357, 0.07080221, 0.04124716, 0.04546931, 0.0659305 ,
0.06690484, 0.05878532, 0.05488795, 0.05293927, 0.04352062,
0.03637545, 0.0591101 , 0.06495615, 0.05975966, 0.05521273,
0.05846054, 0.04481975, 0.04514453, 0.06235791, 0.07697304])
# ... and its target (the next day) looks like
scaled_data_Malaga_y[235]
0.07340045469308218
# Since the first 90 values have no history behind them, the dataset shrinks by 90 elements
print(f'Length of the training data with history: {len(scaled_data_Malaga_y)}')
Length of the training data with history: 476
# We split the data into train and test sets;
# as in the previous analyses, we predict at most 91 days ahead
x_train = scaled_data_Malaga_x[0:len(scaled_data_Malaga_x) - 91]
y_train = scaled_data_Malaga_y[0:len(scaled_data_Malaga_y) - 91]
print(f'Training data size: x={len(x_train)} - y={len(y_train)}')
x_test = scaled_data_Malaga_x[len(scaled_data_Malaga_x) - 90:len(scaled_data_Malaga_x)]
y_test = scaled_data_Malaga_y[len(scaled_data_Malaga_y) - 90:len(scaled_data_Malaga_y)]
print(f'Test data size: x={len(x_test)} - y={len(y_test)}')
Training data size: x=385 - y=385
Test data size: x=90 - y=90
# Reshape the data to feed the recurrent network:
# Keras LSTMs expect (samples, timesteps, features)
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
print("Train data shape:")
print(x_train.shape)
print(y_train.shape)
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))
print("Test data shape:")
print(x_test.shape)
print(y_test.shape)
Train data shape:
(385, 90, 1)
(385,)
Test data shape:
(90, 90, 1)
(90,)
# Configure / set up the neural network model - LSTM
# Build the model
print('Build model...')
model = Sequential()
# The input shape is (timesteps, features): 90 past days, 1 feature per day
neurons = x_train.shape[1]
model.add(LSTM(90,
               activation='relu',
               return_sequences=True,
               input_shape=(x_train.shape[1], 1)))
model.add(LSTM(50,
               activation='relu',
               return_sequences=True))
model.add(LSTM(25,
               activation='relu',
               return_sequences=False))
model.add(Dense(5, activation='relu'))
model.add(Dense(1))
Build model...
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()
Model: "sequential_8"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
lstm_24 (LSTM) (None, 90, 90) 33120
lstm_25 (LSTM) (None, 90, 50) 28200
lstm_26 (LSTM) (None, 25) 7600
dense_16 (Dense) (None, 5) 130
dense_17 (Dense) (None, 1) 6
=================================================================
Total params: 69,056
Trainable params: 69,056
Non-trainable params: 0
_________________________________________________________________
# Training the model
# Fit the network
history = model.fit(x_train,
                    y_train,
                    batch_size=1000,
                    epochs=50,
                    validation_data=(x_test, y_test),
                    verbose=0)
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show()
# Get the predicted values
predictions = model.predict(x_test)
predictions = scaler.inverse_transform(predictions)
3/3 [==============================] - 0s 31ms/step
y_test = y_test.reshape(-1, 1)
y_test = scaler.inverse_transform(y_test)
# Calculate the mean absolute error (MAE)
mae = mean_absolute_error(y_test, predictions)
print('MAE: ' + str(round(mae, 1)))
# Calculate the root mean squared error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print('RMSE: ' + str(round(rmse, 1)))
# Equivalently, let scikit-learn return the RMSE directly
rmse = mean_squared_error(y_test, predictions, squared=False)
print('RMSE: ' + str(round(rmse, 1)))
MAE: 197.7
RMSE: 331.7
RMSE: 331.7
# Add the difference between the real and the predicted cases
train = data_Malaga[:(len(x_train) + 92)]
valid = data_Malaga[(len(x_train) + 91):]
valid.insert(1, "Predictions", predictions, True)
valid.insert(1, "Difference", valid["Predictions"] - valid["num_casos"], True)
# Zoom in to a closer timeframe
# Date from which onward the data is displayed
display_start_date = "2021-07-15"
valid = valid[valid.index > display_start_date]
train = train[train.index > display_start_date]
# Visualize the data
matplotlib.style.use('ggplot')
fig, ax1 = plt.subplots(figsize=(22, 10), sharex=True)

# Data - Train
xt = train.index
yt = train[["num_casos"]]
# Data - Test / validation
xv = valid.index
yv = valid[["num_casos", "Predictions"]]

# Plot
plt.title("Málaga: Predictions vs Real infections", fontsize=20)
plt.ylabel("Nº Cases", fontsize=18)

plt.plot(yt, color="blue", linewidth=1.5)
plt.plot(yv["Predictions"], color="red", linewidth=1.5)
plt.plot(yv["num_casos"], color="green", linewidth=1.5)
plt.legend(["Train", "LSTM Predictions", "Test"], loc="upper left", fontsize=18)

# Bar plot with the differences
x = valid.index
y = valid["Difference"]
plt.bar(x, y, width=0.2, color="grey")

plt.grid()
plt.show()
Sevilla
data_Sevilla = data_covid.loc[data_covid['provincia'] == 'Sevilla']
data_Sevilla = data_Sevilla.set_index('fecha')
data_Sevilla = data_Sevilla.filter(['num_casos'])
data_Sevilla = data_Sevilla['2020-06-14':'2021-12-31']
data_Sevilla
num_casos | |
---|---|
fecha | |
2020-06-14 | 0 |
2020-06-15 | 2 |
2020-06-16 | 1 |
2020-06-17 | 0 |
2020-06-18 | 0 |
... | ... |
2021-12-27 | 2617 |
2021-12-28 | 3190 |
2021-12-29 | 3692 |
2021-12-30 | 3508 |
2021-12-31 | 2816 |
566 rows × 1 columns
data_Sevilla.describe()
num_casos | |
---|---|
count | 566.000000 |
mean | 383.807420 |
std | 454.714408 |
min | 0.000000 |
25% | 112.750000 |
50% | 260.500000 |
75% | 459.000000 |
max | 3692.000000 |
np_data_Sevilla = data_Sevilla.values
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data_Sevilla = scaler.fit_transform(np_data_Sevilla)
print(f'Length of the available dataset: {len(scaled_data_Sevilla)}')
Length of the available dataset: 566
# Since we are going to predict future values based on the X past elements,
# we need to build the historical window for each element
historic_values = 90
scaled_data_Sevilla_x = []
scaled_data_Sevilla_y = []

for num_casos_i in range(historic_values, len(scaled_data_Sevilla)):
    scaled_data_Sevilla_x.append(scaled_data_Sevilla[(num_casos_i - historic_values):num_casos_i, 0])
    scaled_data_Sevilla_y.append(scaled_data_Sevilla[num_casos_i, 0])
# Convert the x and y lists to numpy arrays
scaled_data_Sevilla_x = np.array(scaled_data_Sevilla_x)
scaled_data_Sevilla_y = np.array(scaled_data_Sevilla_y)
# One input sample (90 days of history) looks like
scaled_data_Sevilla_x[235]
array([0.18634886, 0.19799567, 0.14626219, 0.0896533 , 0.13894908,
0.1695558 , 0.14572048, 0.12242687, 0.13434453, 0.06500542,
0.05390033, 0.08206934, 0.08667389, 0.09452871, 0.06798483,
0.05904659, 0.04252438, 0.0343987 , 0.06554713, 0.06473456,
0.05444204, 0.0503792 , 0.05796316, 0.04577465, 0.03819068,
0.04712893, 0.0528169 , 0.0663597 , 0.06013001, 0.05119177,
0.03764897, 0.031961 , 0.05119177, 0.05010834, 0.04739978,
0.05119177, 0.05390033, 0.03927411, 0.03737811, 0.06013001,
0.05633803, 0.05606717, 0.05850488, 0.06473456, 0.04658722,
0.04198267, 0.07150596, 0.07340195, 0.05823402, 0.08071506,
0.07773564, 0.05877573, 0.05471289, 0.07990249, 0.08829902,
0.0872156 , 0.09019502, 0.07069339, 0.08775731, 0.08396533,
0.14951246, 0.12432286, 0.12269772, 0.12378115, 0.13028169,
0.11348862, 0.07936078, 0.13299025, 0.12269772, 0.11213434,
0.12107259, 0.12432286, 0.09723727, 0.07042254, 0.09886241,
0.11782232, 0.10157096, 0.10346696, 0.12621885, 0.08342362,
0.07340195, 0.0847779 , 0.08559047, 0.11159263, 0.10292524,
0.08911159, 0.07502709, 0.04821235, 0.08315276, 0.09913326])
# ... and its target (the next day) looks like
scaled_data_Sevilla_y[235]
0.10861321776814734
# Since the first 90 values have no history behind them, the dataset shrinks by 90 elements
print(f'Length of the training data with history: {len(scaled_data_Sevilla_y)}')
Length of the training data with history: 476
# We split the data into train and test sets;
# as in the previous analyses, we predict at most 91 days ahead
x_train = scaled_data_Sevilla_x[0:len(scaled_data_Sevilla_x) - 91]
y_train = scaled_data_Sevilla_y[0:len(scaled_data_Sevilla_y) - 91]
print(f'Training data size: x={len(x_train)} - y={len(y_train)}')
x_test = scaled_data_Sevilla_x[len(scaled_data_Sevilla_x) - 90:len(scaled_data_Sevilla_x)]
y_test = scaled_data_Sevilla_y[len(scaled_data_Sevilla_y) - 90:len(scaled_data_Sevilla_y)]
print(f'Test data size: x={len(x_test)} - y={len(y_test)}')
Training data size: x=385 - y=385
Test data size: x=90 - y=90
# Reshape the data to feed the recurrent network:
# Keras LSTMs expect (samples, timesteps, features)
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
print("Train data shape:")
print(x_train.shape)
print(y_train.shape)
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))
print("Test data shape:")
print(x_test.shape)
print(y_test.shape)
Train data shape:
(385, 90, 1)
(385,)
Test data shape:
(90, 90, 1)
(90,)
# Configure / set up the neural network model - LSTM
# Build the model
print('Build model...')
model = Sequential()
# The input shape is (timesteps, features): 90 past days, 1 feature per day
neurons = x_train.shape[1]
model.add(LSTM(90,
               activation='relu',
               return_sequences=True,
               input_shape=(x_train.shape[1], 1)))
model.add(LSTM(50,
               activation='relu',
               return_sequences=True))
model.add(LSTM(25,
               activation='relu',
               return_sequences=False))
model.add(Dense(5, activation='relu'))
model.add(Dense(1))
Build model...
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()
Model: "sequential_9"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
lstm_27 (LSTM) (None, 90, 90) 33120
lstm_28 (LSTM) (None, 90, 50) 28200
lstm_29 (LSTM) (None, 25) 7600
dense_18 (Dense) (None, 5) 130
dense_19 (Dense) (None, 1) 6
=================================================================
Total params: 69,056
Trainable params: 69,056
Non-trainable params: 0
_________________________________________________________________
# Training the model
# Fit the network
history = model.fit(x_train,
                    y_train,
                    batch_size=1000,
                    epochs=50,
                    validation_data=(x_test, y_test),
                    verbose=0)
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show()
# Get the predicted values
predictions = model.predict(x_test)
predictions = scaler.inverse_transform(predictions)
3/3 [==============================] - 0s 32ms/step
y_test = y_test.reshape(-1, 1)
y_test = scaler.inverse_transform(y_test)
# Calculate the mean absolute error (MAE)
mae = mean_absolute_error(y_test, predictions)
print('MAE: ' + str(round(mae, 1)))
# Calculate the root mean squared error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print('RMSE: ' + str(round(rmse, 1)))
# Equivalently, let scikit-learn return the RMSE directly
rmse = mean_squared_error(y_test, predictions, squared=False)
print('RMSE: ' + str(round(rmse, 1)))
MAE: 329.2
RMSE: 595.9
RMSE: 595.9
# Add the difference between the real and the predicted cases
train = data_Sevilla[:(len(x_train) + 92)]
valid = data_Sevilla[(len(x_train) + 91):]
valid.insert(1, "Predictions", predictions, True)
valid.insert(1, "Difference", valid["Predictions"] - valid["num_casos"], True)
# Zoom in to a closer timeframe
# Date from which onward the data is displayed
display_start_date = "2021-07-15"
valid = valid[valid.index > display_start_date]
train = train[train.index > display_start_date]
# Visualize the data
matplotlib.style.use('ggplot')
fig, ax1 = plt.subplots(figsize=(22, 10), sharex=True)

# Data - Train
xt = train.index
yt = train[["num_casos"]]
# Data - Test / validation
xv = valid.index
yv = valid[["num_casos", "Predictions"]]

# Plot
plt.title("Sevilla: Predictions vs Real infections", fontsize=20)
plt.ylabel("Nº Cases", fontsize=18)

plt.plot(yt, color="blue", linewidth=1.5)
plt.plot(yv["Predictions"], color="red", linewidth=1.5)
plt.plot(yv["num_casos"], color="green", linewidth=1.5)
plt.legend(["Train", "LSTM Predictions", "Test"], loc="upper left", fontsize=18)

# Bar plot with the differences
x = valid.index
y = valid["Difference"]
plt.bar(x, y, width=0.2, color="grey")

plt.grid()
plt.show()
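To read the per-province results side by side, the figures reported above can be collected into one frame (MAE/RMSE from the metric cells; mean daily cases from each describe(); Barcelona is omitted because its describe() does not appear in this part of the document). An illustrative summary, not an output of the notebook:

results = pd.DataFrame({
    'mae': [2521.6, 197.7, 329.2],
    'rmse': [5345.2, 331.7, 595.9],
    'mean_daily_cases': [1994.1, 339.2, 383.8],
}, index=['Madrid', 'Málaga', 'Sevilla'])
# error relative to the full-period mean, as a rough yardstick
results['mae_over_mean'] = (results['mae'] / results['mean_daily_cases']).round(2)
print(results)

By this yardstick, Madrid's error exceeds its full-period mean, while Málaga's and Sevilla's stay well below theirs.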