LSTM multivariate prediction

Import Python packages:

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

from keras.models import Sequential
from keras.layers import Dense, LSTM
data_covid = pd.read_csv('data/clean/final_covid_data.csv')
data_covid
provincia fecha num_casos num_casos_prueba_pcr num_casos_prueba_test_ac num_casos_prueba_ag num_casos_prueba_elisa num_casos_prueba_desconocida num_hosp num_uci ... ws ws_max sol mob_grocery_pharmacy mob_parks mob_residential mob_retail_recreation mob_transit_stations mob_workplaces mob_flujo
0 Barcelona 2020-01-01 0 0 0 0 0 0 0 0 ... 2.5 7.2 4.9 NaN NaN NaN NaN NaN NaN NaN
1 Madrid 2020-01-01 1 1 0 0 0 0 1 0 ... 0.8 3.6 8.3 NaN NaN NaN NaN NaN NaN NaN
2 Málaga 2020-01-01 0 0 0 0 0 0 0 0 ... 3.3 6.7 7.7 NaN NaN NaN NaN NaN NaN NaN
3 Asturias 2020-01-01 0 0 0 0 0 0 0 0 ... 2.5 7.8 7.9 NaN NaN NaN NaN NaN NaN NaN
4 Sevilla 2020-01-01 0 0 0 0 0 0 1 0 ... 1.9 5.8 9.1 NaN NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
4090 Barcelona 2022-03-29 0 0 0 0 0 0 0 0 ... 7.2 13.3 0.0 0.0 -13.0 5.0 -24.0 -16.0 -17.0 NaN
4091 Madrid 2022-03-29 6 3 0 3 0 0 0 0 ... 2.2 6.1 2.4 1.0 -11.0 4.0 -25.0 -16.0 -16.0 NaN
4092 Málaga 2022-03-29 0 0 0 0 0 0 0 0 ... 4.2 10.8 1.5 4.0 -3.0 4.0 -16.0 1.0 -8.0 NaN
4093 Asturias 2022-03-29 0 0 0 0 0 0 1 0 ... 2.2 6.7 0.0 -4.0 17.0 3.0 -25.0 -15.0 -12.0 NaN
4094 Sevilla 2022-03-29 0 0 0 0 0 0 0 0 ... 2.5 8.3 1.6 3.0 -7.0 2.0 -15.0 -13.0 -5.0 NaN

4095 rows × 26 columns
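
The mobility columns are empty (NaN) at the start of the series, as the first rows above show. A quick check of how many values are missing per mobility column (this cell is not in the original notebook, just a hedged sanity check):

mob_cols = [c for c in data_covid.columns if c.startswith('mob_')]
print(data_covid[mob_cols].isna().sum())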

All the available data (2020-06-14 to 2022-03-29)

Asturias

data_asturias = data_covid.loc[data_covid['provincia'] == 'Asturias']
data_asturias = data_asturias.set_index('fecha')
data_asturias = data_asturias['2020-06-14':]
data_asturias = data_asturias.filter(['num_casos', 'tmed', 'mob_grocery_pharmacy', 
'mob_parks', 'mob_residential', 'mob_residential', 'mob_transit_stations', 'mob_workplaces'])
data_asturias
num_casos tmed mob_grocery_pharmacy mob_parks mob_residential mob_residential mob_transit_stations mob_workplaces
fecha
2020-06-14 0 16.8 -8.0 6.0 4.0 4.0 -43.0 -2.0
2020-06-15 0 16.0 -6.0 39.0 7.0 7.0 -38.0 -27.0
2020-06-16 1 15.6 -6.0 7.0 9.0 9.0 -43.0 -29.0
2020-06-17 0 15.6 -7.0 20.0 8.0 8.0 -42.0 -28.0
2020-06-18 0 15.1 -4.0 68.0 7.0 7.0 -39.0 -28.0
... ... ... ... ... ... ... ... ...
2022-03-25 244 11.6 -2.0 24.0 2.0 2.0 -13.0 -12.0
2022-03-26 432 12.0 -13.0 27.0 1.0 1.0 -19.0 -13.0
2022-03-27 1 14.2 -7.5 33.0 -1.0 -1.0 -13.0 -11.0
2022-03-28 9 13.8 -2.0 45.0 1.0 1.0 -9.0 -10.0
2022-03-29 0 11.7 -4.0 17.0 3.0 3.0 -15.0 -12.0

654 rows × 8 columns

data_asturias.describe()
num_casos tmed mob_grocery_pharmacy mob_parks mob_residential mob_residential mob_transit_stations mob_workplaces
count 654.00000 654.000000 654.00000 654.000000 654.000000 654.000000 654.000000 654.000000
mean 312.11315 13.909098 3.83792 57.725535 3.695719 3.695719 -12.614679 -20.651376
std 565.17985 4.198047 15.05797 68.024228 4.802763 4.802763 14.838430 12.754756
min 0.00000 3.300000 -87.00000 -63.000000 -8.000000 -8.000000 -67.000000 -80.000000
25% 43.50000 10.800000 -0.50000 13.000000 1.000000 1.000000 -22.000000 -26.000000
50% 117.00000 13.900000 5.00000 39.000000 3.000000 3.000000 -13.000000 -18.500000
75% 323.75000 17.600000 10.00000 83.500000 6.000000 6.000000 -4.000000 -14.000000
max 3827.00000 25.100000 42.00000 333.000000 26.000000 26.000000 26.000000 12.000000
np_data_asturias = data_asturias.values
# Train dataset
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data_asturias = scaler.fit_transform(np_data_asturias)
print(f'Longitud del conjunto de datos disponible: {len(scaled_data_asturias)}')
Longitud del conjunto de datos disponible: 654
# Since we are going to predict future values based on the 90 previous elements,
# we need to build, for each element, the window of historical values that precedes it
historic_values = 90
scaled_data_asturias_x = []
scaled_data_asturias_y = []

for i in range(historic_values, len(scaled_data_asturias)):
    scaled_data_asturias_x.append(scaled_data_asturias[(i-historic_values):i, :])
    scaled_data_asturias_y.append(scaled_data_asturias[i, 0])

# Convert the x_train and y_train to numpy arrays
scaled_data_asturias_x = np.array(scaled_data_asturias_x)
scaled_data_asturias_y = np.array(scaled_data_asturias_y)
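
The same sliding-window construction is repeated verbatim for every province in this notebook. As a hedged sketch (the helper below is hypothetical and is not used by the cells that follow), the loop can be wrapped once:

def make_windows(scaled, lookback=90, target_col=0):
    # Build (samples, lookback, features) inputs and the next-day target
    # taken from column `target_col` (num_casos is the first column).
    xs, ys = [], []
    for i in range(lookback, len(scaled)):
        xs.append(scaled[i - lookback:i, :])
        ys.append(scaled[i, target_col])
    return np.array(xs), np.array(ys)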
# Once predicted, we will need a dedicated scaler for num_casos to invert the scaling
scaler_pred = MinMaxScaler(feature_range=(0, 1))
df_cases_asturias = pd.DataFrame(data_asturias['num_casos'])
scaled_data_asturias_pred = scaler_pred.fit_transform(df_cases_asturias)
# Since the first 90 values have no history behind them, the dataset is reduced by 90 samples
print(f'Longitud datos de entrenamiento con historico: {len(scaled_data_asturias_y)}')
Longitud datos de entrenamiento con historico: 564
# We split the data into train and test sets
# As in previous analyses, we are going to predict a maximum of 90 days
x_train = scaled_data_asturias_x[0:len(scaled_data_asturias_x)-91]
y_train = scaled_data_asturias_y[0:len(scaled_data_asturias_y)-91]
print(f'Cantidad datos de entrenamiento: x={len(x_train)} - y={len(y_train)}')

x_test = scaled_data_asturias_x[len(scaled_data_asturias_x)-90:len(scaled_data_asturias_x)]
y_test = scaled_data_asturias_y[len(scaled_data_asturias_y)-90:len(scaled_data_asturias_y)]
print(f'Cantidad datos de test: x={len(x_test)} - y={len(y_test)}')
Cantidad datos de entrenamiento: x=473 - y=473
Cantidad datos de test: x=90 - y=90
# Reshape the data to feed the recurrent network
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 8))
print("Train data shape:")
print(x_train.shape)
print(y_train.shape)
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 8))
print("Test data shape:")
print(x_test.shape)
print(y_test.shape)
Train data shape:
(473, 90, 8)
(473,)
Test data shape:
(90, 90, 8)
(90,)
# Configure / setup the neural network model - LSTM
# Build the model
print('Build model...')
model = Sequential()
# Stacked LSTM model
# input_shape = (timesteps, features) = (90, 8)
neurons = x_train.shape[1]  # number of timesteps (90)
model.add(LSTM(90, 
               activation = 'relu',
               return_sequences = True, 
               input_shape = (x_train.shape[1], 8))) 
model.add(LSTM(50, 
               activation = 'relu',
               return_sequences = True)) 
model.add(LSTM(25, 
               activation = 'relu',
               return_sequences = False)) 
model.add(Dense(5, activation = 'relu'))
model.add(Dense(1))
Build model...
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 lstm (LSTM)                 (None, 90, 90)            35640     
                                                                 
 lstm_1 (LSTM)               (None, 90, 50)            28200     
                                                                 
 lstm_2 (LSTM)               (None, 25)                7600      
                                                                 
 dense (Dense)               (None, 5)                 130       
                                                                 
 dense_1 (Dense)             (None, 1)                 6         
                                                                 
=================================================================
Total params: 71,576
Trainable params: 71,576
Non-trainable params: 0
_________________________________________________________________
# Training the model
# fit network
history = model.fit(x_train, 
                    y_train, 
                    batch_size=1000, 
                    epochs=30, 
                    validation_data = (x_test, y_test), 
                    verbose = 0)
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show() 
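
The training above runs a fixed 30 epochs with batch_size=1000, which is larger than the 473 training windows, so every epoch is a single full-batch update. A hedged alternative (not used anywhere in this notebook) would replace the model.fit call above with smaller batches and early stopping on the validation loss:

from keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
history = model.fit(x_train, y_train,
                    batch_size=32,
                    epochs=100,
                    validation_data=(x_test, y_test),
                    callbacks=[early_stop],
                    verbose=0)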

# Get the predicted values
predictions = model.predict(x_test)
predictions = scaler_pred.inverse_transform(predictions)
3/3 [==============================] - 0s 30ms/step
y_test = y_test.reshape(-1,1)
y_test = scaler_pred.inverse_transform(y_test)
# Calculate the mean absolute error (MAE)
mae = mean_absolute_error(y_test, predictions)
print('MAE: ' + str(round(mae, 1)))

# Calculate the root mean squared error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test,predictions))
print('RMSE: ' + str(round(rmse, 1)))

# The same RMSE, computed with squared=False
rmse = mean_squared_error(y_test, 
                          predictions,
                          squared = False)
print('RMSE: ' + str(round(rmse, 1)))
MAE: 738.5
RMSE: 1169.8
RMSE: 1169.8
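
Case volumes differ by an order of magnitude between provinces, so the raw MAE/RMSE values reported in each section are not directly comparable. A hedged, hypothetical helper (not part of the original notebook) that rescales RMSE by the mean of the actual test values:

def normalised_rmse(y_true, y_pred):
    # RMSE divided by the mean of the actual values, so provinces with
    # very different case volumes can be compared on one scale.
    return np.sqrt(mean_squared_error(y_true, y_pred)) / np.mean(y_true)

print(f'NRMSE: {normalised_rmse(y_test, predictions):.2f}')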
# Add the difference between the actual and predicted cases
train = data_asturias[:(len(x_train)+92)]
valid = data_asturias[(len(x_train)+91):]
valid.insert(1, "Predictions", predictions, True)
valid.insert(1, "Difference", valid["Predictions"] - valid["num_casos"], True)
# Zoom-in to a closer timeframe
# Date from which the data is displayed
display_start_date = "2021-10-31" 
valid = valid[valid.index > display_start_date]
train = train[train.index > display_start_date]
# Visualize the data
matplotlib.style.use('ggplot')
fig, ax1 = plt.subplots(figsize=(22, 10), sharex=True)

# Data - Train
xt = train.index; 
yt = train[["num_casos"]]
# Data - Test / validation 
xv = valid.index; 
yv = valid[["num_casos", "Predictions"]]

# Plot
plt.title("Asturias: Predictions vs Real infections", fontsize=20)
plt.ylabel("Nº Cases", fontsize=18)

plt.plot(yt, color="blue", linewidth=1.5)
plt.plot(yv["Predictions"], color="red", linewidth=1.5)
plt.plot(yv["num_casos"], color="green", linewidth=1.5)
plt.legend(["Train", "LSTM Predictions", "Test"], 
           loc="upper left", fontsize=18)

# Bar plot with the differences
x = valid.index
y = valid["Difference"]
plt.bar(x, y, width=0.2, color="grey")
plt.grid()
plt.show()
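
Everything from the scaling cell down to the error metrics is repeated for each province below, with only the input DataFrame changing. A hedged sketch of how the whole per-province pipeline could be factored into one function (hypothetical, reusing the make_windows helper sketched earlier):

def run_lstm_for_province(df, lookback=90, test_days=90, epochs=30):
    # Scale all features; keep a dedicated scaler for the target column.
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled = scaler.fit_transform(df.values)
    scaler_pred = MinMaxScaler(feature_range=(0, 1)).fit(df[['num_casos']])

    # Sliding windows, then the same train/test split used above.
    x, y = make_windows(scaled, lookback)
    x_train, y_train = x[:-(test_days + 1)], y[:-(test_days + 1)]
    x_test, y_test = x[-test_days:], y[-test_days:]

    model = Sequential([
        LSTM(90, activation='relu', return_sequences=True,
             input_shape=(lookback, x.shape[2])),
        LSTM(50, activation='relu', return_sequences=True),
        LSTM(25, activation='relu'),
        Dense(5, activation='relu'),
        Dense(1),
    ])
    model.compile(optimizer='adam', loss='mean_squared_error')
    model.fit(x_train, y_train, batch_size=1000, epochs=epochs,
              validation_data=(x_test, y_test), verbose=0)

    predictions = scaler_pred.inverse_transform(model.predict(x_test))
    actual = scaler_pred.inverse_transform(y_test.reshape(-1, 1))
    return predictions, actual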

Barcelona

data_barcelona = data_covid.loc[data_covid['provincia'] == 'Barcelona']
data_barcelona = data_barcelona.set_index('fecha')
data_barcelona = data_barcelona['2020-06-14':]
data_barcelona = data_barcelona.filter(['num_casos', 'tmed', 'mob_grocery_pharmacy', 
'mob_parks', 'mob_residential', 'mob_residential', 'mob_transit_stations', 'mob_workplaces'])
data_barcelona
num_casos tmed mob_grocery_pharmacy mob_parks mob_residential mob_residential mob_transit_stations mob_workplaces
fecha
2020-06-14 33 20.6 -20.0 15.0 6.0 6.0 -31.0 -6.0
2020-06-15 62 21.2 -9.0 17.0 14.0 14.0 -34.0 -41.0
2020-06-16 66 20.8 -10.0 -16.0 16.0 16.0 -39.0 -41.0
2020-06-17 70 20.3 -4.0 14.0 15.0 15.0 -33.0 -40.0
2020-06-18 68 18.8 -8.0 -6.0 16.0 16.0 -38.0 -41.0
... ... ... ... ... ... ... ... ...
2022-03-25 598 14.4 3.0 -11.0 4.0 4.0 -18.0 -16.0
2022-03-26 345 12.8 -11.0 -33.0 3.0 3.0 -22.0 -12.0
2022-03-27 252 15.6 -10.0 -3.0 0.0 0.0 -12.0 -10.0
2022-03-28 688 14.0 -1.0 -2.0 4.0 4.0 -15.0 -15.0
2022-03-29 0 13.0 0.0 -13.0 5.0 5.0 -16.0 -17.0

654 rows × 8 columns

data_barcelona.describe()
num_casos tmed mob_grocery_pharmacy mob_parks mob_residential mob_residential mob_transit_stations mob_workplaces
count 654.000000 654.000000 654.000000 654.000000 654.000000 654.000000 654.000000 654.000000
mean 2605.048930 17.294801 0.180428 -5.547401 7.178899 7.178899 -24.041284 -27.391437
std 4823.001644 6.163636 16.568328 15.365483 5.019727 5.019727 11.241460 15.949374
min 0.000000 4.300000 -83.000000 -76.000000 -6.000000 -6.000000 -72.000000 -87.000000
25% 597.250000 12.200000 -7.000000 -15.000000 4.000000 4.000000 -32.000000 -34.000000
50% 1030.500000 16.300000 2.000000 -5.000000 7.000000 7.000000 -24.000000 -25.000000
75% 2243.750000 23.600000 10.000000 6.000000 10.000000 10.000000 -15.000000 -18.000000
max 34701.000000 28.400000 58.000000 33.000000 32.000000 32.000000 -2.000000 6.000000
np_data_barcelona = data_barcelona.values
# Train dataset
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data_barcelona = scaler.fit_transform(np_data_barcelona)
print(f'Longitud del conjunto de datos disponible: {len(scaled_data_barcelona)}')
Longitud del conjunto de datos disponible: 654
# Since we are going to predict future values based on the 90 previous elements,
# we need to build, for each element, the window of historical values that precedes it
historic_values = 90
scaled_data_barcelona_x = []
scaled_data_barcelona_y = []

for i in range(historic_values, len(scaled_data_barcelona)):
    scaled_data_barcelona_x.append(scaled_data_barcelona[(i-historic_values):i, :])
    scaled_data_barcelona_y.append(scaled_data_barcelona[i, 0])

# Convert the x_train and y_train to numpy arrays
scaled_data_barcelona_x = np.array(scaled_data_barcelona_x)
scaled_data_barcelona_y = np.array(scaled_data_barcelona_y)
# Once predicted, we will need a dedicated scaler for num_casos to invert the scaling
scaler_pred = MinMaxScaler(feature_range=(0, 1))
df_cases_barcelona = pd.DataFrame(data_barcelona['num_casos'])
scaled_data_barcelona_pred = scaler_pred.fit_transform(df_cases_barcelona)
# Since the first 90 values have no history behind them, the dataset is reduced by 90 samples
print(f'Longitud datos de entrenamiento con historico: {len(scaled_data_barcelona_y)}')
Longitud datos de entrenamiento con historico: 564
# We split the data into train and test sets
# As in previous analyses, we are going to predict a maximum of 90 days
x_train = scaled_data_barcelona_x[0:len(scaled_data_barcelona_x)-91]
y_train = scaled_data_barcelona_y[0:len(scaled_data_barcelona_y)-91]
print(f'Cantidad datos de entrenamiento: x={len(x_train)} - y={len(y_train)}')

x_test = scaled_data_barcelona_x[len(scaled_data_barcelona_x)-90:len(scaled_data_barcelona_x)]
y_test = scaled_data_barcelona_y[len(scaled_data_barcelona_y)-90:len(scaled_data_barcelona_y)]
print(f'Cantidad datos de test: x={len(x_test)} - y={len(y_test)}')
Cantidad datos de entrenamiento: x=473 - y=473
Cantidad datos de test: x=90 - y=90
# Reshape the data to feed the recurrent network
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 8))
print("Train data shape:")
print(x_train.shape)
print(y_train.shape)
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 8))
print("Test data shape:")
print(x_test.shape)
print(y_test.shape)
Train data shape:
(473, 90, 8)
(473,)
Test data shape:
(90, 90, 8)
(90,)
# Configure / setup the neural network model - LSTM
# Build the model
print('Build model...')
model = Sequential()
# Stacked LSTM model
# input_shape = (timesteps, features) = (90, 8)
neurons = x_train.shape[1]  # number of timesteps (90)
model.add(LSTM(90, 
               activation = 'relu',
               return_sequences = True, 
               input_shape = (x_train.shape[1], 8))) 
model.add(LSTM(50, 
               activation = 'relu',
               return_sequences = True)) 
model.add(LSTM(25, 
               activation = 'relu',
               return_sequences = False)) 
model.add(Dense(5, activation = 'relu'))
model.add(Dense(1))
Build model...
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 lstm_3 (LSTM)               (None, 90, 90)            35640     
                                                                 
 lstm_4 (LSTM)               (None, 90, 50)            28200     
                                                                 
 lstm_5 (LSTM)               (None, 25)                7600      
                                                                 
 dense_2 (Dense)             (None, 5)                 130       
                                                                 
 dense_3 (Dense)             (None, 1)                 6         
                                                                 
=================================================================
Total params: 71,576
Trainable params: 71,576
Non-trainable params: 0
_________________________________________________________________
# Training the model
# fit network
history = model.fit(x_train, 
                    y_train, 
                    batch_size=1000, 
                    epochs=30, 
                    validation_data = (x_test, y_test), 
                    verbose = 0)
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show() 

# Get the predicted values
predictions = model.predict(x_test)
predictions = scaler_pred.inverse_transform(predictions)
3/3 [==============================] - 0s 31ms/step
y_test = y_test.reshape(-1,1)
y_test = scaler_pred.inverse_transform(y_test)
# Calculate the mean absolute error (MAE)
mae = mean_absolute_error(y_test, predictions)
print('MAE: ' + str(round(mae, 1)))

# Calculate the root mean squared error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test,predictions))
print('RMSE: ' + str(round(rmse, 1)))

# The same RMSE, computed with squared=False
rmse = mean_squared_error(y_test, 
                          predictions,
                          squared = False)
print('RMSE: ' + str(round(rmse, 1)))
MAE: 6872.2
RMSE: 11041.0
RMSE: 11041.0
# Add the difference between the actual and predicted cases
train = data_barcelona[:(len(x_train)+92)]
valid = data_barcelona[(len(x_train)+91):]
valid.insert(1, "Predictions", predictions, True)
valid.insert(1, "Difference", valid["Predictions"] - valid["num_casos"], True)
# Zoom-in to a closer timeframe
# Date from which the data is displayed
display_start_date = "2021-10-31" 
valid = valid[valid.index > display_start_date]
train = train[train.index > display_start_date]
# Visualize the data
matplotlib.style.use('ggplot')
fig, ax1 = plt.subplots(figsize=(22, 10), sharex=True)

# Data - Train
xt = train.index; 
yt = train[["num_casos"]]
# Data - Test / validation 
xv = valid.index; 
yv = valid[["num_casos", "Predictions"]]

# Plot
plt.title("barcelona: Predictions vs Real infections", fontsize=20)
plt.ylabel("Nº Cases", fontsize=18)

plt.plot(yt, color="blue", linewidth=1.5)
plt.plot(yv["Predictions"], color="red", linewidth=1.5)
plt.plot(yv["num_casos"], color="green", linewidth=1.5)
plt.legend(["Train", "LSTM Predictions", "Test"], 
           loc="upper left", fontsize=18)

# Bar plot with the differences
x = valid.index
y = valid["Difference"]
plt.bar(x, y, width=0.2, color="grey")
plt.grid()
plt.show()

Madrid

data_madrid = data_covid.loc[data_covid['provincia'] == 'Madrid']
data_madrid = data_madrid.set_index('fecha')
data_madrid = data_madrid['2020-06-14':]
data_madrid = data_madrid.filter(['num_casos', 'tmed', 'mob_grocery_pharmacy', 
'mob_parks', 'mob_residential', 'mob_residential', 'mob_transit_stations', 'mob_workplaces'])
data_madrid
num_casos tmed mob_grocery_pharmacy mob_parks mob_residential mob_residential mob_transit_stations mob_workplaces
fecha
2020-06-14 81 19.0 -15.0 25.0 7.0 7.0 -43.0 -6.0
2020-06-15 153 20.7 -8.0 34.0 16.0 16.0 -43.0 -47.0
2020-06-16 91 21.0 -8.0 21.0 17.0 17.0 -43.0 -47.0
2020-06-17 93 20.2 -5.0 34.0 17.0 17.0 -42.0 -47.0
2020-06-18 85 21.9 -7.0 30.0 16.0 16.0 -44.0 -47.0
... ... ... ... ... ... ... ... ...
2022-03-25 356 11.0 -1.0 -37.0 5.0 5.0 -19.0 -17.0
2022-03-26 303 10.4 -7.0 -27.0 1.0 1.0 -15.0 -5.0
2022-03-27 77 10.6 -3.0 -17.0 0.0 0.0 -17.0 -2.0
2022-03-28 839 10.2 2.0 -3.0 4.0 4.0 -15.0 -17.0
2022-03-29 6 13.0 1.0 -11.0 4.0 4.0 -16.0 -16.0

654 rows × 8 columns

data_madrid.describe()
num_casos tmed mob_grocery_pharmacy mob_parks mob_residential mob_residential mob_transit_stations mob_workplaces
count 654.000000 654.000000 654.000000 654.000000 654.000000 654.000000 654.000000 654.000000
mean 2392.562691 14.909786 -2.608563 -4.874618 6.978593 6.978593 -30.003058 -27.324159
std 3390.272836 8.073496 15.108581 20.826186 5.672763 5.672763 12.389295 18.988563
min 6.000000 -6.200000 -84.000000 -62.000000 -6.000000 -6.000000 -73.000000 -86.000000
25% 662.750000 8.600000 -9.000000 -17.750000 4.000000 4.000000 -39.000000 -40.000000
50% 1413.000000 13.200000 -1.000000 -4.000000 7.000000 7.000000 -31.000000 -26.000000
75% 2475.750000 21.800000 8.000000 11.000000 10.000000 10.000000 -21.000000 -16.000000
max 23811.000000 32.700000 50.000000 77.000000 31.000000 31.000000 1.000000 20.000000
np_data_madrid = data_madrid.values
# Train dataset
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data_madrid = scaler.fit_transform(np_data_madrid)
print(f'Longitud del conjunto de datos disponible: {len(scaled_data_madrid)}')
Longitud del conjunto de datos disponible: 654
# Since we are going to predict future values based on the 90 previous elements,
# we need to build, for each element, the window of historical values that precedes it
historic_values = 90
scaled_data_madrid_x = []
scaled_data_madrid_y = []

for i in range(historic_values, len(scaled_data_madrid)):
    scaled_data_madrid_x.append(scaled_data_madrid[(i-historic_values):i, :])
    scaled_data_madrid_y.append(scaled_data_madrid[i, 0])

# Convert the x_train and y_train to numpy arrays
scaled_data_madrid_x = np.array(scaled_data_madrid_x)
scaled_data_madrid_y = np.array(scaled_data_madrid_y)
# Once predicted, we will need a dedicated scaler for num_casos to invert the scaling
scaler_pred = MinMaxScaler(feature_range=(0, 1))
df_cases_madrid = pd.DataFrame(data_madrid['num_casos'])
scaled_data_madrid_pred = scaler_pred.fit_transform(df_cases_madrid)
# Since the first 90 values have no history behind them, the dataset is reduced by 90 samples
print(f'Longitud datos de entrenamiento con historico: {len(scaled_data_madrid_y)}')
Longitud datos de entrenamiento con historico: 564
# We split the data into train and test sets
# As in previous analyses, we are going to predict a maximum of 90 days
x_train = scaled_data_madrid_x[0:len(scaled_data_madrid_x)-91]
y_train = scaled_data_madrid_y[0:len(scaled_data_madrid_y)-91]
print(f'Cantidad datos de entrenamiento: x={len(x_train)} - y={len(y_train)}')

x_test = scaled_data_madrid_x[len(scaled_data_madrid_x)-90:len(scaled_data_madrid_x)]
y_test = scaled_data_madrid_y[len(scaled_data_madrid_y)-90:len(scaled_data_madrid_y)]
print(f'Cantidad datos de test: x={len(x_test)} - y={len(y_test)}')
Cantidad datos de entrenamiento: x=473 - y=473
Cantidad datos de test: x=90 - y=90
# Reshape the data to feed the recurrent network
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 8))
print("Train data shape:")
print(x_train.shape)
print(y_train.shape)
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 8))
print("Test data shape:")
print(x_test.shape)
print(y_test.shape)
Train data shape:
(473, 90, 8)
(473,)
Test data shape:
(90, 90, 8)
(90,)
# Configure / setup the neural network model - LSTM
# Build the model
print('Build model...')
model = Sequential()
# Stacked LSTM model
# input_shape = (timesteps, features) = (90, 8)
neurons = x_train.shape[1]  # number of timesteps (90)
model.add(LSTM(90, 
               activation = 'relu',
               return_sequences = True, 
               input_shape = (x_train.shape[1], 8))) 
model.add(LSTM(50, 
               activation = 'relu',
               return_sequences = True)) 
model.add(LSTM(25, 
               activation = 'relu',
               return_sequences = False)) 
model.add(Dense(5, activation = 'relu'))
model.add(Dense(1))
Build model...
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 lstm_6 (LSTM)               (None, 90, 90)            35640     
                                                                 
 lstm_7 (LSTM)               (None, 90, 50)            28200     
                                                                 
 lstm_8 (LSTM)               (None, 25)                7600      
                                                                 
 dense_4 (Dense)             (None, 5)                 130       
                                                                 
 dense_5 (Dense)             (None, 1)                 6         
                                                                 
=================================================================
Total params: 71,576
Trainable params: 71,576
Non-trainable params: 0
_________________________________________________________________
# Training the model
# fit network
history = model.fit(x_train, 
                    y_train, 
                    batch_size=1000, 
                    epochs=30, 
                    validation_data = (x_test, y_test), 
                    verbose = 0)
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show() 

# Get the predicted values
predictions = model.predict(x_test)
predictions = scaler_pred.inverse_transform(predictions)
3/3 [==============================] - 0s 30ms/step
y_test = y_test.reshape(-1,1)
y_test = scaler_pred.inverse_transform(y_test)
# Calculate the mean absolute error (MAE)
mae = mean_absolute_error(y_test, predictions)
print('MAE: ' + str(round(mae, 1)))

# Calculate the root mean squared error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test,predictions))
print('RMSE: ' + str(round(rmse, 1)))

# The same RMSE, computed with squared=False
rmse = mean_squared_error(y_test, 
                          predictions,
                          squared = False)
print('RMSE: ' + str(round(rmse, 1)))
MAE: 4531.8
RMSE: 7047.1
RMSE: 7047.1
# Add the difference between the actual and predicted cases
train = data_madrid[:(len(x_train)+92)]
valid = data_madrid[(len(x_train)+91):]
valid.insert(1, "Predictions", predictions, True)
valid.insert(1, "Difference", valid["Predictions"] - valid["num_casos"], True)
# Zoom-in to a closer timeframe
# Date from which the data is displayed
display_start_date = "2021-10-31" 
valid = valid[valid.index > display_start_date]
train = train[train.index > display_start_date]
# Visualize the data
matplotlib.style.use('ggplot')
fig, ax1 = plt.subplots(figsize=(22, 10), sharex=True)

# Data - Train
xt = train.index; 
yt = train[["num_casos"]]
# Data - Test / validation 
xv = valid.index; 
yv = valid[["num_casos", "Predictions"]]

# Plot
plt.title("madrid: Predictions vs Real infections", fontsize=20)
plt.ylabel("Nº Cases", fontsize=18)

plt.plot(yt, color="blue", linewidth=1.5)
plt.plot(yv["Predictions"], color="red", linewidth=1.5)
plt.plot(yv["num_casos"], color="green", linewidth=1.5)
plt.legend(["Train", "LSTM Predictions", "Test"], 
           loc="upper left", fontsize=18)

# Bar plot with the differences
x = valid.index
y = valid["Difference"]
plt.bar(x, y, width=0.2, color="grey")
plt.grid()
plt.show()

Málaga

data_malaga = data_covid.loc[data_covid['provincia'] == 'Málaga']
data_malaga = data_malaga.set_index('fecha')
data_malaga = data_malaga['2020-06-14':]
data_malaga = data_malaga.filter(['num_casos', 'tmed', 'mob_grocery_pharmacy', 
'mob_parks', 'mob_residential', 'mob_residential', 'mob_transit_stations', 'mob_workplaces'])
data_malaga
num_casos tmed mob_grocery_pharmacy mob_parks mob_residential mob_residential mob_transit_stations mob_workplaces
fecha
2020-06-14 2 26.6 -20.0 -15.0 2.0 2.0 -53.0 -7.0
2020-06-15 1 27.2 -16.0 -9.0 9.0 9.0 -44.0 -33.0
2020-06-16 1 27.8 -13.0 -4.0 9.0 9.0 -40.0 -33.0
2020-06-17 2 26.9 -13.0 1.0 9.0 9.0 -41.0 -33.0
2020-06-18 2 21.6 -12.0 3.0 9.0 9.0 -42.0 -33.0
... ... ... ... ... ... ... ... ...
2022-03-25 565 13.8 -3.0 -32.0 8.0 8.0 -18.0 -12.0
2022-03-26 79 14.5 -1.0 -1.0 3.0 3.0 -13.0 -4.0
2022-03-27 65 15.4 5.0 -18.0 2.0 2.0 0.0 0.0
2022-03-28 39 16.3 6.0 -4.0 3.0 3.0 -4.0 -7.0
2022-03-29 0 16.8 4.0 -3.0 4.0 4.0 1.0 -8.0

654 rows × 8 columns

data_malaga.describe()
num_casos tmed mob_grocery_pharmacy mob_parks mob_residential mob_residential mob_transit_stations mob_workplaces
count 654.000000 654.000000 654.000000 654.000000 654.000000 654.000000 654.000000 654.00000
mean 410.206422 19.511162 8.776758 13.570336 4.529052 4.529052 -18.853211 -18.04893
std 489.452348 5.721942 26.696909 39.148014 4.064365 4.064365 18.183256 14.55877
min 0.000000 6.200000 -80.000000 -63.000000 -4.000000 -4.000000 -70.000000 -79.00000
25% 122.250000 14.725000 -4.000000 -13.000000 2.000000 2.000000 -32.000000 -26.00000
50% 215.500000 18.400000 6.000000 5.000000 4.000000 4.000000 -21.000000 -17.00000
75% 475.250000 24.500000 15.000000 30.000000 6.750000 6.750000 -4.000000 -11.00000
max 3080.000000 34.500000 156.000000 152.000000 22.000000 22.000000 20.000000 29.00000
np_data_malaga = data_malaga.values
# Train dataset
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data_malaga = scaler.fit_transform(np_data_malaga)
print(f'Longitud del conjunto de datos disponible: {len(scaled_data_malaga)}')
Longitud del conjunto de datos disponible: 654
# Since we are going to predict future values based on the 90 previous elements,
# we need to build, for each element, the window of historical values that precedes it
historic_values = 90
scaled_data_malaga_x = []
scaled_data_malaga_y = []

for i in range(historic_values, len(scaled_data_malaga)):
    scaled_data_malaga_x.append(scaled_data_malaga[(i-historic_values):i, :])
    scaled_data_malaga_y.append(scaled_data_malaga[i, 0])

# Convert the x_train and y_train to numpy arrays
scaled_data_malaga_x = np.array(scaled_data_malaga_x)
scaled_data_malaga_y = np.array(scaled_data_malaga_y)
# Once predicted, we will need a dedicated scaler for num_casos to invert the scaling
scaler_pred = MinMaxScaler(feature_range=(0, 1))
df_cases_malaga = pd.DataFrame(data_malaga['num_casos'])
scaled_data_malaga_pred = scaler_pred.fit_transform(df_cases_malaga)
# Since the first 90 values have no history behind them, the dataset is reduced by 90 samples
print(f'Longitud datos de entrenamiento con historico: {len(scaled_data_malaga_y)}')
Longitud datos de entrenamiento con historico: 564
# We split the data into train and test sets
# As in previous analyses, we are going to predict a maximum of 90 days
x_train = scaled_data_malaga_x[0:len(scaled_data_malaga_x)-91]
y_train = scaled_data_malaga_y[0:len(scaled_data_malaga_y)-91]
print(f'Cantidad datos de entrenamiento: x={len(x_train)} - y={len(y_train)}')

x_test = scaled_data_malaga_x[len(scaled_data_malaga_x)-90:len(scaled_data_malaga_x)]
y_test = scaled_data_malaga_y[len(scaled_data_malaga_y)-90:len(scaled_data_malaga_y)]
print(f'Cantidad datos de test: x={len(x_test)} - y={len(y_test)}')
Cantidad datos de entrenamiento: x=473 - y=473
Cantidad datos de test: x=90 - y=90
# Reshape the data to feed the recurrent network
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 8))
print("Train data shape:")
print(x_train.shape)
print(y_train.shape)
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 8))
print("Test data shape:")
print(x_test.shape)
print(y_test.shape)
Train data shape:
(473, 90, 8)
(473,)
Test data shape:
(90, 90, 8)
(90,)
# Configure / setup the neural network model - LSTM
# Build the model
print('Build model...')
model = Sequential()
# Stacked LSTM model
# input_shape = (timesteps, features) = (90, 8)
neurons = x_train.shape[1]  # number of timesteps (90)
model.add(LSTM(90, 
               activation = 'relu',
               return_sequences = True, 
               input_shape = (x_train.shape[1], 8))) 
model.add(LSTM(50, 
               activation = 'relu',
               return_sequences = True)) 
model.add(LSTM(25, 
               activation = 'relu',
               return_sequences = False)) 
model.add(Dense(5, activation = 'relu'))
model.add(Dense(1))
Build model...
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()
Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 lstm_9 (LSTM)               (None, 90, 90)            35640     
                                                                 
 lstm_10 (LSTM)              (None, 90, 50)            28200     
                                                                 
 lstm_11 (LSTM)              (None, 25)                7600      
                                                                 
 dense_6 (Dense)             (None, 5)                 130       
                                                                 
 dense_7 (Dense)             (None, 1)                 6         
                                                                 
=================================================================
Total params: 71,576
Trainable params: 71,576
Non-trainable params: 0
_________________________________________________________________
# Training the model
# fit network
history = model.fit(x_train, 
                    y_train, 
                    batch_size=1000, 
                    epochs=30, 
                    validation_data = (x_test, y_test), 
                    verbose = 0)
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show() 

# Get the predicted values
predictions = model.predict(x_test)
predictions = scaler_pred.inverse_transform(predictions)
3/3 [==============================] - 0s 31ms/step
y_test = y_test.reshape(-1,1)
y_test = scaler_pred.inverse_transform(y_test)
# Calculate the mean absolute error (MAE)
mae = mean_absolute_error(y_test, predictions)
print('MAE: ' + str(round(mae, 1)))

# Calculate the root mean squared error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test,predictions))
print('RMSE: ' + str(round(rmse, 1)))

# The same RMSE, computed with squared=False
rmse = mean_squared_error(y_test, 
                          predictions,
                          squared = False)
print('RMSE: ' + str(round(rmse, 1)))
MAE: 322.4
RMSE: 508.6
RMSE: 508.6
# Add the difference between the actual and predicted cases
train = data_malaga[:(len(x_train)+92)]
valid = data_malaga[(len(x_train)+91):]
valid.insert(1, "Predictions", predictions, True)
valid.insert(1, "Difference", valid["Predictions"] - valid["num_casos"], True)
# Zoom-in to a closer timeframe
# Date from which the data is displayed
display_start_date = "2021-10-31" 
valid = valid[valid.index > display_start_date]
train = train[train.index > display_start_date]
# Visualize the data
matplotlib.style.use('ggplot')
fig, ax1 = plt.subplots(figsize=(22, 10), sharex=True)

# Data - Train
xt = train.index; 
yt = train[["num_casos"]]
# Data - Test / validation 
xv = valid.index; 
yv = valid[["num_casos", "Predictions"]]

# Plot
plt.title("malaga: Predictions vs Real infections", fontsize=20)
plt.ylabel("Nº Cases", fontsize=18)

plt.plot(yt, color="blue", linewidth=1.5)
plt.plot(yv["Predictions"], color="red", linewidth=1.5)
plt.plot(yv["num_casos"], color="green", linewidth=1.5)
plt.legend(["Train", "LSTM Predictions", "Test"], 
           loc="upper left", fontsize=18)

# Bar plot with the differences
x = valid.index
y = valid["Difference"]
plt.bar(x, y, width=0.2, color="grey")
plt.grid()
plt.show()

Sevilla

data_sevilla = data_covid.loc[data_covid['provincia'] == 'Sevilla']
data_sevilla = data_sevilla.set_index('fecha')
data_sevilla = data_sevilla['2020-06-14':]
data_sevilla = data_sevilla.filter(['num_casos', 'tmed', 'mob_grocery_pharmacy', 
'mob_parks', 'mob_residential', 'mob_residential', 'mob_transit_stations', 'mob_workplaces'])
data_sevilla
num_casos tmed mob_grocery_pharmacy mob_parks mob_residential mob_residential mob_transit_stations mob_workplaces
fecha
2020-06-14 0 22.200000 -24.0 -36.0 0.0 0.0 -51.0 0.0
2020-06-15 2 23.088889 -14.0 -13.0 9.0 9.0 -41.0 -33.0
2020-06-16 1 23.977778 -10.0 -4.0 9.0 9.0 -37.0 -32.0
2020-06-17 0 24.866667 -10.0 -7.0 9.0 9.0 -37.0 -32.0
2020-06-18 0 25.755556 -12.0 -13.0 9.0 9.0 -41.0 -33.0
... ... ... ... ... ... ... ... ...
2022-03-25 365 14.100000 2.0 -20.0 4.0 4.0 -14.0 -7.0
2022-03-26 67 15.200000 -9.0 2.0 0.0 0.0 -12.0 -5.0
2022-03-27 24 15.900000 2.0 -13.0 -1.0 -1.0 -10.0 -7.0
2022-03-28 12 17.100000 2.0 -5.0 2.0 2.0 -10.0 -6.0
2022-03-29 0 16.000000 3.0 -7.0 2.0 2.0 -13.0 -5.0

654 rows × 8 columns

data_sevilla.describe()
num_casos tmed mob_grocery_pharmacy mob_parks mob_residential mob_residential mob_transit_stations mob_workplaces
count 654.000000 654.000000 654.000000 654.000000 654.000000 654.000000 654.000000 654.000000
mean 432.978593 19.663609 -0.709480 -14.958716 3.915902 3.915902 -25.707951 -20.347095
std 500.618517 6.949955 18.851349 17.982375 4.397963 4.397963 14.307026 14.434846
min 0.000000 5.500000 -79.000000 -70.000000 -7.000000 -7.000000 -71.000000 -79.000000
25% 127.250000 13.800000 -8.000000 -27.000000 1.000000 1.000000 -36.000000 -29.000000
50% 282.000000 18.600000 0.000000 -13.000000 4.000000 4.000000 -26.000000 -17.000000
75% 528.250000 25.600000 7.000000 -4.000000 6.000000 6.000000 -15.000000 -10.000000
max 3692.000000 34.400000 117.000000 61.000000 22.000000 22.000000 16.000000 9.000000
np_data_sevilla = data_sevilla.values
# Train dataset
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data_sevilla = scaler.fit_transform(np_data_sevilla)
print(f'Longitud del conjunto de datos disponible: {len(scaled_data_sevilla)}')
Longitud del conjunto de datos disponible: 654
# Since we are going to predict future values based on the 90 previous elements,
# we need to build, for each element, the window of historical values that precedes it
historic_values = 90
scaled_data_sevilla_x = []
scaled_data_sevilla_y = []

for i in range(historic_values, len(scaled_data_sevilla)):
    scaled_data_sevilla_x.append(scaled_data_sevilla[(i-historic_values):i, :])
    scaled_data_sevilla_y.append(scaled_data_sevilla[i, 0])

# Convert the x_train and y_train to numpy arrays
scaled_data_sevilla_x = np.array(scaled_data_sevilla_x)
scaled_data_sevilla_y = np.array(scaled_data_sevilla_y)
# Once predicted, we will need a dedicated scaler for num_casos to invert the scaling
scaler_pred = MinMaxScaler(feature_range=(0, 1))
df_cases_sevilla = pd.DataFrame(data_sevilla['num_casos'])
scaled_data_sevilla_pred = scaler_pred.fit_transform(df_cases_sevilla)
# Since the first 90 values have no history behind them, the dataset is reduced by 90 samples
print(f'Longitud datos de entrenamiento con historico: {len(scaled_data_sevilla_y)}')
Longitud datos de entrenamiento con historico: 564
# We split the data into train and test sets
# As in previous analyses, we are going to predict a maximum of 90 days
x_train = scaled_data_sevilla_x[0:len(scaled_data_sevilla_x)-91]
y_train = scaled_data_sevilla_y[0:len(scaled_data_sevilla_y)-91]
print(f'Cantidad datos de entrenamiento: x={len(x_train)} - y={len(y_train)}')

x_test = scaled_data_sevilla_x[len(scaled_data_sevilla_x)-90:len(scaled_data_sevilla_x)]
y_test = scaled_data_sevilla_y[len(scaled_data_sevilla_y)-90:len(scaled_data_sevilla_y)]
print(f'Cantidad datos de test: x={len(x_test)} - y={len(y_test)}')
Cantidad datos de entrenamiento: x=473 - y=473
Cantidad datos de test: x=90 - y=90
# Reshape the data to feed the recurrent network
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 8))
print("Train data shape:")
print(x_train.shape)
print(y_train.shape)
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 8))
print("Test data shape:")
print(x_test.shape)
print(y_test.shape)
Train data shape:
(473, 90, 8)
(473,)
Test data shape:
(90, 90, 8)
(90,)
# Configure / setup the neural network model - LSTM
# Build the model
print('Build model...')
model = Sequential()
# Stacked LSTM model
# input_shape = (timesteps, features) = (90, 8)
neurons = x_train.shape[1]  # number of timesteps (90)
model.add(LSTM(90, 
               activation = 'relu',
               return_sequences = True, 
               input_shape = (x_train.shape[1], 8))) 
model.add(LSTM(50, 
               activation = 'relu',
               return_sequences = True)) 
model.add(LSTM(25, 
               activation = 'relu',
               return_sequences = False)) 
model.add(Dense(5, activation = 'relu'))
model.add(Dense(1))
Build model...
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()
Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 lstm_12 (LSTM)              (None, 90, 90)            35640     
                                                                 
 lstm_13 (LSTM)              (None, 90, 50)            28200     
                                                                 
 lstm_14 (LSTM)              (None, 25)                7600      
                                                                 
 dense_8 (Dense)             (None, 5)                 130       
                                                                 
 dense_9 (Dense)             (None, 1)                 6         
                                                                 
=================================================================
Total params: 71,576
Trainable params: 71,576
Non-trainable params: 0
_________________________________________________________________
# Training the model
# fit network
history = model.fit(x_train, 
                    y_train, 
                    batch_size=1000, 
                    epochs=30, 
                    validation_data = (x_test, y_test), 
                    verbose = 0)
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show() 

# Get the predicted values
predictions = model.predict(x_test)
predictions = scaler_pred.inverse_transform(predictions)
3/3 [==============================] - 0s 31ms/step
y_test = y_test.reshape(-1,1)
y_test = scaler_pred.inverse_transform(y_test)
# Calculate the mean absolute error (MAE)
mae = mean_absolute_error(y_test, predictions)
print('MAE: ' + str(round(mae, 1)))

# Calculate the root mean squared error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test,predictions))
print('RMSE: ' + str(round(rmse, 1)))

# The same RMSE, computed with squared=False
rmse = mean_squared_error(y_test, 
                          predictions,
                          squared = False)
print('RMSE: ' + str(round(rmse, 1)))
MAE: 1622.4
RMSE: 1889.7
RMSE: 1889.7
# Add the difference between the actual and predicted cases
train = data_sevilla[:(len(x_train)+92)]
valid = data_sevilla[(len(x_train)+91):]
valid.insert(1, "Predictions", predictions, True)
valid.insert(1, "Difference", valid["Predictions"] - valid["num_casos"], True)
# Zoom-in to a closer timeframe
# Date from which the data is displayed
display_start_date = "2021-10-31" 
valid = valid[valid.index > display_start_date]
train = train[train.index > display_start_date]
# Visualize the data
matplotlib.style.use('ggplot')
fig, ax1 = plt.subplots(figsize=(22, 10), sharex=True)

# Data - Train
xt = train.index; 
yt = train[["num_casos"]]
# Data - Test / validation 
xv = valid.index; 
yv = valid[["num_casos", "Predictions"]]

# Plot
plt.title("sevilla: Predictions vs Real infections", fontsize=20)
plt.ylabel("Nº Cases", fontsize=18)

plt.plot(yt, color="blue", linewidth=1.5)
plt.plot(yv["Predictions"], color="red", linewidth=1.5)
plt.plot(yv["num_casos"], color="green", linewidth=1.5)
plt.legend(["Train", "LSTM Predictions", "Test"], 
           loc="upper left", fontsize=18)

# Bar plot with the differences
x = valid.index
y = valid["Difference"]
plt.bar(x, y, width=0.2, color="grey")
plt.grid()
plt.show()

Just before the sixth wave (series truncated at 2021-12-31, so the 90-day test window falls in the last quarter of 2021)

Asturias

data_asturias = data_covid.loc[data_covid['provincia'] == 'Asturias']
data_asturias = data_asturias.set_index('fecha')
data_asturias = data_asturias['2020-06-14':'2021-12-31']
data_asturias = data_asturias.filter(['num_casos', 'tmed', 'mob_grocery_pharmacy', 
'mob_parks', 'mob_residential', 'mob_residential', 'mob_transit_stations', 'mob_workplaces'])
data_asturias
num_casos tmed mob_grocery_pharmacy mob_parks mob_residential mob_residential mob_transit_stations mob_workplaces
fecha
2020-06-14 0 16.8 -8.0 6.0 4.0 4.0 -43.0 -2.0
2020-06-15 0 16.0 -6.0 39.0 7.0 7.0 -38.0 -27.0
2020-06-16 1 15.6 -6.0 7.0 9.0 9.0 -43.0 -29.0
2020-06-17 0 15.6 -7.0 20.0 8.0 8.0 -42.0 -28.0
2020-06-18 0 15.1 -4.0 68.0 7.0 7.0 -39.0 -28.0
... ... ... ... ... ... ... ... ...
2021-12-27 2363 16.6 21.0 57.0 5.0 5.0 -9.0 -35.0
2021-12-28 2150 19.2 18.0 90.0 4.0 4.0 -6.0 -36.0
2021-12-29 2159 15.6 24.0 108.0 4.0 4.0 0.0 -34.0
2021-12-30 2020 13.2 39.0 88.0 4.0 4.0 -10.0 -36.0
2021-12-31 1949 14.6 21.0 46.0 10.0 10.0 -21.0 -54.0

566 rows × 8 columns

data_asturias.describe()
num_casos tmed mob_grocery_pharmacy mob_parks mob_residential mob_residential mob_transit_stations mob_workplaces
count 566.000000 566.000000 566.000000 566.000000 566.000000 566.000000 566.000000 566.000000
mean 180.913428 14.495671 4.159011 62.592756 3.798587 3.798587 -12.772085 -21.289753
std 276.250633 4.098429 15.207805 71.365461 5.051424 5.051424 15.668332 13.018468
min 0.000000 3.300000 -87.000000 -63.000000 -8.000000 -8.000000 -67.000000 -80.000000
25% 36.000000 11.525000 -0.375000 14.000000 0.000000 0.000000 -23.000000 -27.000000
50% 94.500000 15.000000 5.000000 43.000000 3.000000 3.000000 -13.000000 -20.000000
75% 225.750000 18.000000 11.000000 91.000000 7.000000 7.000000 -2.000000 -15.000000
max 2363.000000 25.100000 42.000000 333.000000 26.000000 26.000000 26.000000 12.000000
np_data_asturias = data_asturias.values
# Train dataset
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data_asturias = scaler.fit_transform(np_data_asturias)
print(f'Longitud del conjunto de datos disponible: {len(scaled_data_asturias)}')
Longitud del conjunto de datos disponible: 566
# Since we are going to predict future values based on the 90 previous elements,
# we need to build, for each element, the window of historical values that precedes it
historic_values = 90
scaled_data_asturias_x = []
scaled_data_asturias_y = []

for i in range(historic_values, len(scaled_data_asturias)):
    scaled_data_asturias_x.append(scaled_data_asturias[(i-historic_values):i, :])
    scaled_data_asturias_y.append(scaled_data_asturias[i, 0])

# Convert the x_train and y_train to numpy arrays
scaled_data_asturias_x = np.array(scaled_data_asturias_x)
scaled_data_asturias_y = np.array(scaled_data_asturias_y)
# Once predicted, we will need a dedicated scaler for num_casos to invert the scaling
scaler_pred = MinMaxScaler(feature_range=(0, 1))
df_cases_asturias = pd.DataFrame(data_asturias['num_casos'])
scaled_data_asturias_pred = scaler_pred.fit_transform(df_cases_asturias)
# Since the first 90 values have no history behind them, the dataset is reduced by 90 samples
print(f'Longitud datos de entrenamiento con historico: {len(scaled_data_asturias_y)}')
Longitud datos de entrenamiento con historico: 476
# We split the data into train and test sets
# As in previous analyses, we are going to predict a maximum of 90 days
x_train = scaled_data_asturias_x[0:len(scaled_data_asturias_x)-91]
y_train = scaled_data_asturias_y[0:len(scaled_data_asturias_y)-91]
print(f'Cantidad datos de entrenamiento: x={len(x_train)} - y={len(y_train)}')

x_test = scaled_data_asturias_x[len(scaled_data_asturias_x)-90:len(scaled_data_asturias_x)]
y_test = scaled_data_asturias_y[len(scaled_data_asturias_y)-90:len(scaled_data_asturias_y)]
print(f'Cantidad datos de test: x={len(x_test)} - y={len(y_test)}')
Cantidad datos de entrenamiento: x=385 - y=385
Cantidad datos de test: x=90 - y=90
# Reshape the data to feed the recurrent network
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 8))
print("Train data shape:")
print(x_train.shape)
print(y_train.shape)
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 8))
print("Test data shape:")
print(x_test.shape)
print(y_test.shape)
Train data shape:
(385, 90, 8)
(385,)
Test data shape:
(90, 90, 8)
(90,)
# Configure / setup the neural network model - LSTM
# Build the model
print('Build model...')
model = Sequential()
# Stacked LSTM model
# input_shape = (timesteps, features) = (90, 8)
neurons = x_train.shape[1]  # number of timesteps (90)
model.add(LSTM(90, 
               activation = 'relu',
               return_sequences = True, 
               input_shape = (x_train.shape[1], 8))) 
model.add(LSTM(50, 
               activation = 'relu',
               return_sequences = True)) 
model.add(LSTM(25, 
               activation = 'relu',
               return_sequences = False)) 
model.add(Dense(5, activation = 'relu'))
model.add(Dense(1))
Build model...
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()
Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 lstm_15 (LSTM)              (None, 90, 90)            35640     
                                                                 
 lstm_16 (LSTM)              (None, 90, 50)            28200     
                                                                 
 lstm_17 (LSTM)              (None, 25)                7600      
                                                                 
 dense_10 (Dense)            (None, 5)                 130       
                                                                 
 dense_11 (Dense)            (None, 1)                 6         
                                                                 
=================================================================
Total params: 71,576
Trainable params: 71,576
Non-trainable params: 0
_________________________________________________________________
# Training the model
# fit network
history = model.fit(x_train, 
                    y_train, 
                    batch_size=1000, 
                    epochs=50, 
                    validation_data = (x_test, y_test), 
                    verbose = 0)
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show() 

# Get the predicted values
predictions = model.predict(x_test)
predictions = scaler_pred.inverse_transform(predictions)
3/3 [==============================] - 0s 31ms/step
y_test = y_test.reshape(-1,1)
y_test = scaler_pred.inverse_transform(y_test)
# Calculate the mean absolute error (MAE)
mae = mean_absolute_error(y_test, predictions)
print('MAE: ' + str(round(mae, 1)))

# Calculate the root mean squared error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test,predictions))
print('RMSE: ' + str(round(rmse, 1)))

# The same RMSE, computed with squared=False
rmse = mean_squared_error(y_test, 
                          predictions,
                          squared = False)
print('RMSE: ' + str(round(rmse, 1)))
MAE: 263.9
RMSE: 465.3
RMSE: 465.3
# Add the difference between the actual and predicted cases
train = data_asturias[:(len(x_train)+92)]
valid = data_asturias[(len(x_train)+91):]
valid.insert(1, "Predictions", predictions, True)
valid.insert(1, "Difference", valid["Predictions"] - valid["num_casos"], True)
# Zoom-in to a closer timeframe
# Date from which the data is displayed
display_start_date = "2021-07-15" 
valid = valid[valid.index > display_start_date]
train = train[train.index > display_start_date]
# Visualize the data
matplotlib.style.use('ggplot')
fig, ax1 = plt.subplots(figsize=(22, 10), sharex=True)

# Data - Train
xt = train.index; 
yt = train[["num_casos"]]
# Data - Test / validation 
xv = valid.index; 
yv = valid[["num_casos", "Predictions"]]

# Plot
plt.title("Asturias: Predictions vs Real infections", fontsize=20)
plt.ylabel("Nº Cases", fontsize=18)

plt.plot(yt, color="blue", linewidth=1.5)
plt.plot(yv["Predictions"], color="red", linewidth=1.5)
plt.plot(yv["num_casos"], color="green", linewidth=1.5)
plt.legend(["Train", "LSTM Predictions", "Test"], 
           loc="upper left", fontsize=18)

# Bar plot with the differences
x = valid.index
y = valid["Difference"]
plt.bar(x, y, width=0.2, color="grey")
plt.grid()
plt.show()

Barcelona

data_barcelona = data_covid.loc[data_covid['provincia'] == 'Barcelona']
data_barcelona = data_barcelona.set_index('fecha')
data_barcelona = data_barcelona['2020-06-14':'2021-12-31']
data_barcelona = data_barcelona.filter(['num_casos', 'tmed', 'mob_grocery_pharmacy', 
'mob_parks', 'mob_residential', 'mob_residential', 'mob_transit_stations', 'mob_workplaces'])
data_barcelona
num_casos tmed mob_grocery_pharmacy mob_parks mob_residential mob_residential mob_transit_stations mob_workplaces
fecha
2020-06-14 33 20.6 -20.0 15.0 6.0 6.0 -31.0 -6.0
2020-06-15 62 21.2 -9.0 17.0 14.0 14.0 -34.0 -41.0
2020-06-16 66 20.8 -10.0 -16.0 16.0 16.0 -39.0 -41.0
2020-06-17 70 20.3 -4.0 14.0 15.0 15.0 -33.0 -40.0
2020-06-18 68 18.8 -8.0 -6.0 16.0 16.0 -38.0 -41.0
... ... ... ... ... ... ... ... ...
2021-12-27 19383 16.4 15.0 7.0 12.0 12.0 -25.0 -52.0
2021-12-28 20192 17.7 17.0 7.0 13.0 13.0 -25.0 -52.0
2021-12-29 19361 17.0 23.0 10.0 13.0 13.0 -25.0 -51.0
2021-12-30 17639 15.8 37.0 6.0 12.0 12.0 -24.0 -53.0
2021-12-31 16651 13.6 26.0 -12.0 16.0 16.0 -36.0 -60.0

566 rows × 8 columns

data_barcelona.describe()
num_casos tmed mob_grocery_pharmacy mob_parks mob_residential mob_residential mob_transit_stations mob_workplaces
count 566.000000 566.000000 566.000000 566.000000 566.000000 566.000000 566.000000 566.000000
mean 1575.480565 18.205477 -0.574205 -4.651943 7.353357 7.353357 -24.865724 -28.471731
std 2281.290383 6.086976 16.775476 15.728472 5.162377 5.162377 11.574598 16.238872
min 33.000000 4.300000 -83.000000 -76.000000 -6.000000 -6.000000 -72.000000 -87.000000
25% 544.750000 12.800000 -7.000000 -14.750000 4.000000 4.000000 -33.000000 -36.000000
50% 944.500000 17.800000 0.500000 -4.000000 8.000000 8.000000 -25.000000 -26.000000
75% 1609.500000 24.375000 10.000000 7.000000 10.000000 10.000000 -15.000000 -19.250000
max 20192.000000 28.400000 58.000000 33.000000 32.000000 32.000000 -2.000000 6.000000
np_data_barcelona = data_barcelona.values
# Train dataset
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data_barcelona = scaler.fit_transform(np_data_barcelona)
print(f'Longitud del conjunto de datos disponible: {len(scaled_data_barcelona)}')
Longitud del conjunto de datos disponible: 566
# Since we are going to predict future values based on the 90 previous elements,
# we need to build, for each element, the window of historical values that precedes it
historic_values = 90
scaled_data_barcelona_x = []
scaled_data_barcelona_y = []

for i in range(historic_values, len(scaled_data_barcelona)):
    scaled_data_barcelona_x.append(scaled_data_barcelona[(i-historic_values):i, :])
    scaled_data_barcelona_y.append(scaled_data_barcelona[i, 0])

# Convert the x_train and y_train to numpy arrays
scaled_data_barcelona_x = np.array(scaled_data_barcelona_x)
scaled_data_barcelona_y = np.array(scaled_data_barcelona_y)
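To make the windowing concrete, here is the same construction on a toy array (purely illustrative; toy, win, X and y are not part of the pipeline):

toy = np.arange(10).reshape(-1, 1)                 # 10 timesteps, 1 feature
win = 3                                            # window length (90 in the real pipeline)
X = np.array([toy[i - win:i, :] for i in range(win, len(toy))])
y = np.array([toy[i, 0] for i in range(win, len(toy))])
print(X.shape, y.shape)                            # (7, 3, 1) (7,)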
# Later we will need a dedicated scaler for num_casos: the main scaler was fitted on all 8 features,
# so it cannot inverse-transform a single-column prediction on its own
scaler_pred = MinMaxScaler(feature_range=(0, 1))
df_cases_barcelona = pd.DataFrame(data_barcelona['num_casos'])
scaled_data_barcelona_pred = scaler_pred.fit_transform(df_cases_barcelona)
# Since the first 90 values have no history window, the dataset shrinks by 90 samples (566 - 90 = 476)
print(f'Length of the training data with history: {len(scaled_data_barcelona_y)}')
Length of the training data with history: 476
# we split the data into train and test sets
# as in the previous analysis, we are going to predict a maximum of 90 days
x_train = scaled_data_barcelona_x[0:len(scaled_data_barcelona_x)-91]
y_train = scaled_data_barcelona_y[0:len(scaled_data_barcelona_y)-91]
print(f'Number of training samples: x={len(x_train)} - y={len(y_train)}')

x_test = scaled_data_barcelona_x[len(scaled_data_barcelona_x)-90:len(scaled_data_barcelona_x)]
y_test = scaled_data_barcelona_y[len(scaled_data_barcelona_y)-90:len(scaled_data_barcelona_y)]
print(f'Number of test samples: x={len(x_test)} - y={len(y_test)}')
Number of training samples: x=385 - y=385
Number of test samples: x=90 - y=90
# Reshape the data to feed the recurrent network
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 8))
print("Train data shape:")
print(x_train.shape)
print(y_train.shape)
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 8))
print("Test data shape:")
print(x_test.shape)
print(y_test.shape)
Train data shape:
(385, 90, 8)
(385,)
Test data shape:
(90, 90, 8)
(90,)
# Configure / setup the neural network model - LSTM
# Build the model
print('Build model...')
model = Sequential()
# Model with stacked LSTM layers
# input_shape = (timesteps, features) = (90, 8)
neurons = x_train.shape[1]
model.add(LSTM(90, 
               activation = 'relu',
               return_sequences = True,    # pass the full 90-step sequence on to the next stacked LSTM
               input_shape = (x_train.shape[1], 8))) 
model.add(LSTM(50, 
               activation = 'relu',
               return_sequences = True)) 
model.add(LSTM(25, 
               activation = 'relu',
               return_sequences = False))  # only the final state feeds the Dense head
model.add(Dense(5, activation = 'relu'))
model.add(Dense(1))
Build model...
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()
Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 lstm_18 (LSTM)              (None, 90, 90)            35640     
                                                                 
 lstm_19 (LSTM)              (None, 90, 50)            28200     
                                                                 
 lstm_20 (LSTM)              (None, 25)                7600      
                                                                 
 dense_12 (Dense)            (None, 5)                 130       
                                                                 
 dense_13 (Dense)            (None, 1)                 6         
                                                                 
=================================================================
Total params: 71,576
Trainable params: 71,576
Non-trainable params: 0
_________________________________________________________________
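The parameter counts in the summary can be cross-checked against the standard LSTM formula, params = 4 * ((input_dim + units + 1) * units); a quick sanity check (lstm_params is a throwaway helper, not part of the pipeline):

lstm_params = lambda input_dim, units: 4 * ((input_dim + units + 1) * units)
print(lstm_params(8, 90))     # 35640 -> first LSTM (8 input features, 90 units)
print(lstm_params(90, 50))    # 28200 -> second LSTM
print(lstm_params(50, 25))    #  7600 -> third LSTM
print(25 * 5 + 5, 5 * 1 + 1)  # 130 and 6 -> the two Dense layers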
# Training the model
# fit network
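# note: batch_size (1000) is larger than the 385 training samples, so each epoch effectively runs as a single full batch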
history = model.fit(x_train, 
                    y_train, 
                    batch_size=1000, 
                    epochs=50, 
                    validation_data = (x_test, y_test), 
                    verbose = 0)
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show() 

# Get the predicted values
predictions = model.predict(x_test)
predictions = scaler_pred.inverse_transform(predictions)
3/3 [==============================] - 0s 31ms/step
y_test = y_test.reshape(-1,1)
y_test = scaler_pred.inverse_transform(y_test)
# Calculate the mean absolute error (MAE)
mae = mean_absolute_error(y_test, predictions)
print('MAE: ' + str(round(mae, 1)))

# Calculate the root mean squared error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test,predictions))
print('RMSE: ' + str(round(rmse, 1)))

# Alternative: the same RMSE computed directly with squared=False
rmse = mean_squared_error(y_test, 
                          predictions,
                          squared = False)
print('RMSE: ' + str(round(rmse, 1)))
MAE: 2657.6
RMSE: 4156.6
RMSE: 4156.6
# Add the difference between the real and predicted cases
train = data_barcelona[:(len(x_train)+92)]
valid = data_barcelona[(len(x_train)+91):]
valid.insert(1, "Predictions", predictions, True)
valid.insert(1, "Difference", valid["Predictions"] - valid["num_casos"], True)
# Zoom-in to a closer timeframe
# Date from which onwards the data is displayed
display_start_date = "2021-07-15" 
valid = valid[valid.index > display_start_date]
train = train[train.index > display_start_date]
# Visualize the data
matplotlib.style.use('ggplot')
fig, ax1 = plt.subplots(figsize=(22, 10), sharex=True)

# Data - Train
xt = train.index; 
yt = train[["num_casos"]]
# Data - Test / validation 
xv = valid.index; 
yv = valid[["num_casos", "Predictions"]]

# Plot
plt.title("barcelona: Predictions vs Real infections", fontsize=20)
plt.ylabel("Nº Cases", fontsize=18)

plt.plot(yt, color="blue", linewidth=1.5)
plt.plot(yv["Predictions"], color="red", linewidth=1.5)
plt.plot(yv["num_casos"], color="green", linewidth=1.5)
plt.legend(["Train", "LSTM Predictions", "Test"], 
           loc="upper left", fontsize=18)

# Bar plot with the differences
x = valid.index
y = valid["Difference"]
plt.bar(x, y, width=0.2, color="grey")
plt.grid()
plt.show()

Madrid

data_madrid = data_covid.loc[data_covid['provincia'] == 'Madrid']
data_madrid = data_madrid.set_index('fecha')
data_madrid = data_madrid['2020-06-14':'2021-12-31']
data_madrid = data_madrid.filter(['num_casos', 'tmed', 'mob_grocery_pharmacy', 
'mob_parks', 'mob_residential', 'mob_residential', 'mob_transit_stations', 'mob_workplaces'])
data_madrid
num_casos tmed mob_grocery_pharmacy mob_parks mob_residential mob_residential mob_transit_stations mob_workplaces
fecha
2020-06-14 81 19.0 -15.0 25.0 7.0 7.0 -43.0 -6.0
2020-06-15 153 20.7 -8.0 34.0 16.0 16.0 -43.0 -47.0
2020-06-16 91 21.0 -8.0 21.0 17.0 17.0 -43.0 -47.0
2020-06-17 93 20.2 -5.0 34.0 17.0 17.0 -42.0 -47.0
2020-06-18 85 21.9 -7.0 30.0 16.0 16.0 -44.0 -47.0
... ... ... ... ... ... ... ... ...
2021-12-27 22958 12.3 21.0 19.0 12.0 12.0 -28.0 -52.0
2021-12-28 23811 10.4 18.0 23.0 12.0 12.0 -30.0 -52.0
2021-12-29 21914 9.0 27.0 18.0 13.0 13.0 -30.0 -52.0
2021-12-30 20666 8.7 42.0 17.0 12.0 12.0 -30.0 -53.0
2021-12-31 7556 9.2 10.0 -15.0 18.0 18.0 -47.0 -69.0

566 rows × 8 columns

data_madrid.describe()
num_casos tmed mob_grocery_pharmacy mob_parks mob_residential mob_residential mob_transit_stations mob_workplaces
count 566.000000 566.000000 566.000000 566.000000 566.000000 566.000000 566.000000 566.000000
mean 1994.136042 16.004064 -3.793286 -4.438163 7.238516 7.238516 -31.560071 -28.715548
std 2795.419848 8.085617 14.945139 21.384112 5.850467 5.850467 12.292703 19.230654
min 28.000000 -6.200000 -84.000000 -62.000000 -6.000000 -6.000000 -73.000000 -86.000000
25% 550.250000 9.525000 -11.000000 -17.750000 4.000000 4.000000 -40.000000 -41.000000
50% 1312.500000 14.700000 -3.000000 -3.000000 7.000000 7.000000 -33.000000 -28.000000
75% 2282.500000 23.200000 6.000000 11.750000 11.000000 11.000000 -24.000000 -17.000000
max 23811.000000 32.700000 50.000000 77.000000 31.000000 31.000000 1.000000 20.000000
np_data_madrid = data_madrid.values
# Train dataset
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data_madrid = scaler.fit_transform(np_data_madrid)
print(f'Length of the available dataset: {len(scaled_data_madrid)}')
Length of the available dataset: 566
# Since we are going to predict future values based on the 90 previous elements, 
# we need to build, for each element, a list with its 90 historic values
historic_values = 90
scaled_data_madrid_x = []
scaled_data_madrid_y = []

for i in range(historic_values, len(scaled_data_madrid)):
    scaled_data_madrid_x.append(scaled_data_madrid[(i-historic_values):i, :])
    scaled_data_madrid_y.append(scaled_data_madrid[i, 0])

# Convert the x_train and y_train to numpy arrays
scaled_data_madrid_x = np.array(scaled_data_madrid_x)
scaled_data_madrid_y = np.array(scaled_data_madrid_y)
# Later we will need a dedicated scaler for num_casos: the main scaler was fitted on all 8 features,
# so it cannot inverse-transform a single-column prediction on its own
scaler_pred = MinMaxScaler(feature_range=(0, 1))
df_cases_madrid = pd.DataFrame(data_madrid['num_casos'])
scaled_data_madrid_pred = scaler_pred.fit_transform(df_cases_madrid)
# Since the first 90 values have no history window, the dataset shrinks by 90 samples (566 - 90 = 476)
print(f'Length of the training data with history: {len(scaled_data_madrid_y)}')
Length of the training data with history: 476
# we split the data into train and test sets
# as in the previous analysis, we are going to predict a maximum of 90 days
x_train = scaled_data_madrid_x[0:len(scaled_data_madrid_x)-91]
y_train = scaled_data_madrid_y[0:len(scaled_data_madrid_y)-91]
print(f'Number of training samples: x={len(x_train)} - y={len(y_train)}')

x_test = scaled_data_madrid_x[len(scaled_data_madrid_x)-90:len(scaled_data_madrid_x)]
y_test = scaled_data_madrid_y[len(scaled_data_madrid_y)-90:len(scaled_data_madrid_y)]
print(f'Number of test samples: x={len(x_test)} - y={len(y_test)}')
Number of training samples: x=385 - y=385
Number of test samples: x=90 - y=90
# Reshape the data to feed the recurrent network
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 8))
print("Train data shape:")
print(x_train.shape)
print(y_train.shape)
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 8))
print("Test data shape:")
print(x_test.shape)
print(y_test.shape)
Train data shape:
(385, 90, 8)
(385,)
Test data shape:
(90, 90, 8)
(90,)
# Configure / setup the neural network model - LSTM
# Build the model
print('Build model...')
model = Sequential()
# Model with stacked LSTM layers
# input_shape = (timesteps, features) = (90, 8)
neurons = x_train.shape[1]
model.add(LSTM(90, 
               activation = 'relu',
               return_sequences = True, 
               input_shape = (x_train.shape[1], 8))) 
model.add(LSTM(50, 
               activation = 'relu',
               return_sequences = True)) 
model.add(LSTM(25, 
               activation = 'relu',
               return_sequences = False)) 
model.add(Dense(5, activation = 'relu'))
model.add(Dense(1))
Build model...
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()
Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 lstm_21 (LSTM)              (None, 90, 90)            35640     
                                                                 
 lstm_22 (LSTM)              (None, 90, 50)            28200     
                                                                 
 lstm_23 (LSTM)              (None, 25)                7600      
                                                                 
 dense_14 (Dense)            (None, 5)                 130       
                                                                 
 dense_15 (Dense)            (None, 1)                 6         
                                                                 
=================================================================
Total params: 71,576
Trainable params: 71,576
Non-trainable params: 0
_________________________________________________________________
# Training the model
# fit network
history = model.fit(x_train, 
                    y_train, 
                    batch_size=1000, 
                    epochs=50, 
                    validation_data = (x_test, y_test), 
                    verbose = 0)
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show() 

# Get the predicted values
predictions = model.predict(x_test)
predictions = scaler_pred.inverse_transform(predictions)
3/3 [==============================] - 0s 31ms/step
y_test = y_test.reshape(-1,1)
y_test = scaler_pred.inverse_transform(y_test)
# Calculate the mean absolute error (MAE)
mae = mean_absolute_error(y_test, predictions)
print('MAE: ' + str(round(mae, 1)))

# Calculate the root mean squared error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test,predictions))
print('RMSE: ' + str(round(rmse, 1)))

# Alternative: the same RMSE computed directly with squared=False
rmse = mean_squared_error(y_test, 
                          predictions,
                          squared = False)
print('RMSE: ' + str(round(rmse, 1)))
MAE: 2755.3
RMSE: 5897.2
RMSE: 5897.2
# Add the difference between the real and predicted cases
train = data_madrid[:(len(x_train)+92)]
valid = data_madrid[(len(x_train)+91):]
valid.insert(1, "Predictions", predictions, True)
valid.insert(1, "Difference", valid["Predictions"] - valid["num_casos"], True)
# Zoom-in to a closer timeframe
# Date from which onwards the data is displayed
display_start_date = "2021-07-15" 
valid = valid[valid.index > display_start_date]
train = train[train.index > display_start_date]
# Visualize the data
matplotlib.style.use('ggplot')
fig, ax1 = plt.subplots(figsize=(22, 10), sharex=True)

# Data - Train
xt = train.index; 
yt = train[["num_casos"]]
# Data - Test / validation 
xv = valid.index; 
yv = valid[["num_casos", "Predictions"]]

# Plot
plt.title("madrid: Predictions vs Real infections", fontsize=20)
plt.ylabel("Nº Cases", fontsize=18)

plt.plot(yt, color="blue", linewidth=1.5)
plt.plot(yv["Predictions"], color="red", linewidth=1.5)
plt.plot(yv["num_casos"], color="green", linewidth=1.5)
plt.legend(["Train", "LSTM Predictions", "Test"], 
           loc="upper left", fontsize=18)

# Bar plot with the differences
x = valid.index
y = valid["Difference"]
plt.bar(x, y, width=0.2, color="grey")
plt.grid()
plt.show()

Málaga

data_malaga = data_covid.loc[data_covid['provincia'] == 'Málaga']
data_malaga = data_malaga.set_index('fecha')
data_malaga = data_malaga['2020-06-14':'2021-12-31']
data_malaga = data_malaga.filter(['num_casos', 'tmed', 'mob_grocery_pharmacy', 
'mob_parks', 'mob_residential', 'mob_residential', 'mob_transit_stations', 'mob_workplaces'])
data_malaga
num_casos tmed mob_grocery_pharmacy mob_parks mob_residential mob_residential mob_transit_stations mob_workplaces
fecha
2020-06-14 2 26.6 -20.0 -15.0 2.0 2.0 -53.0 -7.0
2020-06-15 1 27.2 -16.0 -9.0 9.0 9.0 -44.0 -33.0
2020-06-16 1 27.8 -13.0 -4.0 9.0 9.0 -40.0 -33.0
2020-06-17 2 26.9 -13.0 1.0 9.0 9.0 -41.0 -33.0
2020-06-18 2 21.6 -12.0 3.0 9.0 9.0 -42.0 -33.0
... ... ... ... ... ... ... ... ...
2021-12-27 1627 17.9 15.0 17.0 7.0 7.0 -12.0 -37.0
2021-12-28 2772 17.8 18.0 30.0 7.0 7.0 -7.0 -37.0
2021-12-29 3080 17.0 25.0 37.0 6.0 6.0 -8.0 -37.0
2021-12-30 3075 14.4 39.0 33.0 5.0 5.0 -8.0 -37.0
2021-12-31 2646 11.1 22.0 -5.0 12.0 12.0 -34.0 -50.0

566 rows × 8 columns

data_malaga.describe()
num_casos tmed mob_grocery_pharmacy mob_parks mob_residential mob_residential mob_transit_stations mob_workplaces
count 566.000000 566.000000 566.000000 566.000000 566.000000 566.000000 566.000000 566.000000
mean 339.227915 20.332332 8.427562 15.765018 4.588339 4.588339 -19.860424 -18.724382
std 427.261346 5.690349 27.032414 41.263233 4.227823 4.227823 19.025924 14.733563
min 1.000000 6.200000 -80.000000 -63.000000 -4.000000 -4.000000 -70.000000 -79.000000
25% 110.000000 15.600000 -5.000000 -14.000000 2.000000 2.000000 -34.000000 -27.000000
50% 193.500000 20.000000 4.000000 7.000000 4.000000 4.000000 -23.000000 -18.000000
75% 344.750000 25.075000 16.000000 36.000000 7.000000 7.000000 -4.000000 -12.000000
max 3080.000000 34.500000 156.000000 152.000000 22.000000 22.000000 20.000000 29.000000
np_data_malaga = data_malaga.values
# Train dataset
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data_malaga = scaler.fit_transform(np_data_malaga)
print(f'Length of the available dataset: {len(scaled_data_malaga)}')
Length of the available dataset: 566
# Since we are going to predict future values based on the 90 previous elements, 
# we need to build, for each element, a list with its 90 historic values
historic_values = 90
scaled_data_malaga_x = []
scaled_data_malaga_y = []

for i in range(historic_values, len(scaled_data_malaga)):
    scaled_data_malaga_x.append(scaled_data_malaga[(i-historic_values):i, :])
    scaled_data_malaga_y.append(scaled_data_malaga[i, 0])

# Convert the x_train and y_train to numpy arrays
scaled_data_malaga_x = np.array(scaled_data_malaga_x)
scaled_data_malaga_y = np.array(scaled_data_malaga_y)
# Later we will need a dedicated scaler for num_casos: the main scaler was fitted on all 8 features,
# so it cannot inverse-transform a single-column prediction on its own
scaler_pred = MinMaxScaler(feature_range=(0, 1))
df_cases_malaga = pd.DataFrame(data_malaga['num_casos'])
scaled_data_malaga_pred = scaler_pred.fit_transform(df_cases_malaga)
# Since the first 90 values have no history window, the dataset shrinks by 90 samples (566 - 90 = 476)
print(f'Length of the training data with history: {len(scaled_data_malaga_y)}')
Length of the training data with history: 476
# we split the data into train and test sets
# as in the previous analysis, we are going to predict a maximum of 90 days
x_train = scaled_data_malaga_x[0:len(scaled_data_malaga_x)-91]
y_train = scaled_data_malaga_y[0:len(scaled_data_malaga_y)-91]
print(f'Number of training samples: x={len(x_train)} - y={len(y_train)}')

x_test = scaled_data_malaga_x[len(scaled_data_malaga_x)-90:len(scaled_data_malaga_x)]
y_test = scaled_data_malaga_y[len(scaled_data_malaga_y)-90:len(scaled_data_malaga_y)]
print(f'Number of test samples: x={len(x_test)} - y={len(y_test)}')
Number of training samples: x=385 - y=385
Number of test samples: x=90 - y=90
# Reshape the data to feed the recurrent network
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 8))
print("Train data shape:")
print(x_train.shape)
print(y_train.shape)
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 8))
print("Test data shape:")
print(x_test.shape)
print(y_test.shape)
Train data shape:
(385, 90, 8)
(385,)
Test data shape:
(90, 90, 8)
(90,)
# Configure / setup the neural network model - LSTM
# Build the model
print('Build model...')
model = Sequential()
# Model with stacked LSTM layers
# input_shape = (timesteps, features) = (90, 8)
neurons = x_train.shape[1]
model.add(LSTM(90, 
               activation = 'relu',
               return_sequences = True, 
               input_shape = (x_train.shape[1], 8))) 
model.add(LSTM(50, 
               activation = 'relu',
               return_sequences = True)) 
model.add(LSTM(25, 
               activation = 'relu',
               return_sequences = False)) 
model.add(Dense(5, activation = 'relu'))
model.add(Dense(1))
Build model...
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()
Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 lstm_24 (LSTM)              (None, 90, 90)            35640     
                                                                 
 lstm_25 (LSTM)              (None, 90, 50)            28200     
                                                                 
 lstm_26 (LSTM)              (None, 25)                7600      
                                                                 
 dense_16 (Dense)            (None, 5)                 130       
                                                                 
 dense_17 (Dense)            (None, 1)                 6         
                                                                 
=================================================================
Total params: 71,576
Trainable params: 71,576
Non-trainable params: 0
_________________________________________________________________
# Training the model
# fit network
history = model.fit(x_train, 
                    y_train, 
                    batch_size=1000, 
                    epochs=50, 
                    validation_data = (x_test, y_test), 
                    verbose = 0)
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show() 

# Get the predicted values
predictions = model.predict(x_test)
predictions = scaler_pred.inverse_transform(predictions)
3/3 [==============================] - 0s 31ms/step
y_test = y_test.reshape(-1,1)
y_test = scaler_pred.inverse_transform(y_test)
# Calculate the mean absolute error (MAE)
mae = mean_absolute_error(y_test, predictions)
print('MAE: ' + str(round(mae, 1)))

# Calculate the root mean squared error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test,predictions))
print('RMSE: ' + str(round(rmse, 1)))

# Alternative: the same RMSE computed directly with squared=False
rmse = mean_squared_error(y_test, 
                          predictions,
                          squared = False)
print('RMSE: ' + str(round(rmse, 1)))
MAE: 301.5
RMSE: 408.5
RMSE: 408.5
# Add the difference between the real and predicted cases
train = data_malaga[:(len(x_train)+92)]
valid = data_malaga[(len(x_train)+91):]
valid.insert(1, "Predictions", predictions, True)
valid.insert(1, "Difference", valid["Predictions"] - valid["num_casos"], True)
# Zoom-in to a closer timeframe
# Date from which onwards the data is displayed
display_start_date = "2021-07-15" 
valid = valid[valid.index > display_start_date]
train = train[train.index > display_start_date]
# Visualize the data
matplotlib.style.use('ggplot')
fig, ax1 = plt.subplots(figsize=(22, 10), sharex=True)

# Data - Train
xt = train.index; 
yt = train[["num_casos"]]
# Data - Test / validation 
xv = valid.index; 
yv = valid[["num_casos", "Predictions"]]

# Plot
plt.title("malaga: Predictions vs Real infections", fontsize=20)
plt.ylabel("Nº Cases", fontsize=18)

plt.plot(yt, color="blue", linewidth=1.5)
plt.plot(yv["Predictions"], color="red", linewidth=1.5)
plt.plot(yv["num_casos"], color="green", linewidth=1.5)
plt.legend(["Train", "LSTM Predictions", "Test"], 
           loc="upper left", fontsize=18)

# Bar plot with the differences
x = valid.index
y = valid["Difference"]
plt.bar(x, y, width=0.2, color="grey")
plt.grid()
plt.show()

Sevilla

data_sevilla = data_covid.loc[data_covid['provincia'] == 'Sevilla']
data_sevilla = data_sevilla.set_index('fecha')
data_sevilla = data_sevilla['2020-06-14':'2021-12-31']
data_sevilla = data_sevilla.filter(['num_casos', 'tmed', 'mob_grocery_pharmacy', 
'mob_parks', 'mob_residential', 'mob_residential', 'mob_transit_stations', 'mob_workplaces'])
data_sevilla
num_casos tmed mob_grocery_pharmacy mob_parks mob_residential mob_residential mob_transit_stations mob_workplaces
fecha
2020-06-14 0 22.200000 -24.0 -36.0 0.0 0.0 -51.0 0.0
2020-06-15 2 23.088889 -14.0 -13.0 9.0 9.0 -41.0 -33.0
2020-06-16 1 23.977778 -10.0 -4.0 9.0 9.0 -37.0 -32.0
2020-06-17 0 24.866667 -10.0 -7.0 9.0 9.0 -37.0 -32.0
2020-06-18 0 25.755556 -12.0 -13.0 9.0 9.0 -41.0 -33.0
... ... ... ... ... ... ... ... ...
2021-12-27 2617 17.600000 16.0 -1.0 7.0 7.0 -19.0 -39.0
2021-12-28 3190 16.000000 17.0 15.0 7.0 7.0 -16.0 -37.0
2021-12-29 3692 14.100000 24.0 18.0 6.0 6.0 -18.0 -38.0
2021-12-30 3508 13.000000 39.0 11.0 6.0 6.0 -21.0 -39.0
2021-12-31 2816 14.600000 15.0 -24.0 12.0 12.0 -44.0 -53.0

566 rows × 8 columns

data_sevilla.describe()
num_casos tmed mob_grocery_pharmacy mob_parks mob_residential mob_residential mob_transit_stations mob_workplaces
count 566.000000 566.000000 566.000000 566.000000 566.000000 566.000000 566.000000 566.000000
mean 383.807420 20.691519 -1.908127 -16.176678 4.086572 4.086572 -27.310954 -21.436396
std 454.714408 6.882766 18.249947 18.068323 4.554635 4.554635 14.360299 14.353391
min 0.000000 5.500000 -79.000000 -70.000000 -7.000000 -7.000000 -71.000000 -79.000000
25% 112.750000 15.000000 -10.000000 -28.000000 1.000000 1.000000 -37.000000 -30.000000
50% 260.500000 20.800000 -1.000000 -13.000000 4.000000 4.000000 -28.000000 -18.000000
75% 459.000000 26.575000 6.000000 -5.000000 7.000000 7.000000 -18.000000 -11.000000
max 3692.000000 34.400000 113.000000 61.000000 22.000000 22.000000 16.000000 9.000000
np_data_sevilla = data_sevilla.values
# Train dataset
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data_sevilla = scaler.fit_transform(np_data_sevilla)
print(f'Length of the available dataset: {len(scaled_data_sevilla)}')
Length of the available dataset: 566
# Since we are going to predict future values based on the 90 previous elements, 
# we need to build, for each element, a list with its 90 historic values
historic_values = 90
scaled_data_sevilla_x = []
scaled_data_sevilla_y = []

for i in range(historic_values, len(scaled_data_sevilla)):
    scaled_data_sevilla_x.append(scaled_data_sevilla[(i-historic_values):i, :])
    scaled_data_sevilla_y.append(scaled_data_sevilla[i, 0])

# Convert the x_train and y_train to numpy arrays
scaled_data_sevilla_x = np.array(scaled_data_sevilla_x)
scaled_data_sevilla_y = np.array(scaled_data_sevilla_y)
# Later we will need a dedicated scaler for num_casos: the main scaler was fitted on all 8 features,
# so it cannot inverse-transform a single-column prediction on its own
scaler_pred = MinMaxScaler(feature_range=(0, 1))
df_cases_sevilla = pd.DataFrame(data_sevilla['num_casos'])
scaled_data_sevilla_pred = scaler_pred.fit_transform(df_cases_sevilla)
# Since the first 90 values have no history window, the dataset shrinks by 90 samples (566 - 90 = 476)
print(f'Length of the training data with history: {len(scaled_data_sevilla_y)}')
Length of the training data with history: 476
# we split the data into train and test sets
# as in the previous analysis, we are going to predict a maximum of 90 days
x_train = scaled_data_sevilla_x[0:len(scaled_data_sevilla_x)-91]
y_train = scaled_data_sevilla_y[0:len(scaled_data_sevilla_y)-91]
print(f'Number of training samples: x={len(x_train)} - y={len(y_train)}')

x_test = scaled_data_sevilla_x[len(scaled_data_sevilla_x)-90:len(scaled_data_sevilla_x)]
y_test = scaled_data_sevilla_y[len(scaled_data_sevilla_y)-90:len(scaled_data_sevilla_y)]
print(f'Number of test samples: x={len(x_test)} - y={len(y_test)}')
Number of training samples: x=385 - y=385
Number of test samples: x=90 - y=90
# Reshape the data to feed the recurrent network
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 8))
print("Train data shape:")
print(x_train.shape)
print(y_train.shape)
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 8))
print("Test data shape:")
print(x_test.shape)
print(y_test.shape)
Train data shape:
(385, 90, 8)
(385,)
Test data shape:
(90, 90, 8)
(90,)
# Configure / setup the neural network model - LSTM
# Build the model
print('Build model...')
model = Sequential()
# Model with stacked LSTM layers
# input_shape = (timesteps, features) = (90, 8)
neurons = x_train.shape[1]
model.add(LSTM(90, 
               activation = 'relu',
               return_sequences = True, 
               input_shape = (x_train.shape[1], 8))) 
model.add(LSTM(50, 
               activation = 'relu',
               return_sequences = True)) 
model.add(LSTM(25, 
               activation = 'relu',
               return_sequences = False)) 
model.add(Dense(5, activation = 'relu'))
model.add(Dense(1))
Build model...
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()
Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 lstm_27 (LSTM)              (None, 90, 90)            35640     
                                                                 
 lstm_28 (LSTM)              (None, 90, 50)            28200     
                                                                 
 lstm_29 (LSTM)              (None, 25)                7600      
                                                                 
 dense_18 (Dense)            (None, 5)                 130       
                                                                 
 dense_19 (Dense)            (None, 1)                 6         
                                                                 
=================================================================
Total params: 71,576
Trainable params: 71,576
Non-trainable params: 0
_________________________________________________________________
# Training the model
# fit network
history = model.fit(x_train, 
                    y_train, 
                    batch_size=1000, 
                    epochs=50, 
                    validation_data = (x_test, y_test), 
                    verbose = 0)
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show() 

# Get the predicted values
predictions = model.predict(x_test)
predictions = scaler_pred.inverse_transform(predictions)
3/3 [==============================] - 0s 31ms/step
y_test = y_test.reshape(-1,1)
y_test = scaler_pred.inverse_transform(y_test)
# Calculate the mean absolute error (MAE)
mae = mean_absolute_error(y_test, predictions)
print('MAE: ' + str(round(mae, 1)))

# Calculate the root mean squared error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test,predictions))
print('RMSE: ' + str(round(rmse, 1)))

# Alternative: the same RMSE computed directly with squared=False
rmse = mean_squared_error(y_test, 
                          predictions,
                          squared = False)
print('RMSE: ' + str(round(rmse, 1)))
MAE: 502.2
RMSE: 535.4
RMSE: 535.4
# Add the difference between the real and predicted cases
train = data_sevilla[:(len(x_train)+92)]
valid = data_sevilla[(len(x_train)+91):]
valid.insert(1, "Predictions", predictions, True)
valid.insert(1, "Difference", valid["Predictions"] - valid["num_casos"], True)
# Zoom-in to a closer timeframe
# Date from which onwards the data is displayed
display_start_date = "2021-07-15" 
valid = valid[valid.index > display_start_date]
train = train[train.index > display_start_date]
# Visualize the data
matplotlib.style.use('ggplot')
fig, ax1 = plt.subplots(figsize=(22, 10), sharex=True)

# Data - Train
xt = train.index; 
yt = train[["num_casos"]]
# Data - Test / validation 
xv = valid.index; 
yv = valid[["num_casos", "Predictions"]]

# Plot
plt.title("sevilla: Predictions vs Real infections", fontsize=20)
plt.ylabel("Nº Cases", fontsize=18)

plt.plot(yt, color="blue", linewidth=1.5)
plt.plot(yv["Predictions"], color="red", linewidth=1.5)
plt.plot(yv["num_casos"], color="green", linewidth=1.5)
plt.legend(["Train", "LSTM Predictions", "Test"], 
           loc="upper left", fontsize=18)

# Bar plot with the differences
x = valid.index
y = valid["Difference"]
plt.bar(x, y, width=0.2, color="grey")
plt.grid()
plt.show()
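Since the Barcelona, Madrid, Málaga and Sevilla blocks repeat the same preprocessing verbatim, one possible clean-up (a sketch only; build_sequences is a hypothetical helper, not something defined in this notebook, and it relies on the imports at the top) would be to factor the windowing and splitting into a single function:

def build_sequences(df, historic_values=90, test_days=90):
    """Scale a province frame and return train/test windows plus a scaler for num_casos."""
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled = scaler.fit_transform(df.values)
    # dedicated scaler for the target column, used later to invert the predictions
    scaler_pred = MinMaxScaler(feature_range=(0, 1))
    scaler_pred.fit(df[['num_casos']])
    x = np.array([scaled[i - historic_values:i, :] for i in range(historic_values, len(scaled))])
    y = np.array([scaled[i, 0] for i in range(historic_values, len(scaled))])
    # same split as above: the last test_days samples for test, the rest (minus one) for training
    x_train, y_train = x[:len(x) - (test_days + 1)], y[:len(y) - (test_days + 1)]
    x_test, y_test = x[len(x) - test_days:], y[len(y) - test_days:]
    return x_train, y_train, x_test, y_test, scaler_pred

# usage sketch:
# x_train, y_train, x_test, y_test, scaler_pred = build_sequences(data_sevilla)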