# loading of packages required for the analyses
::p_load(
pacman# file locator
here, # data management and ggplot2 graphics
tidyverse, # get overview of data
skimr, # produce and adorn tabulations and cross-tabulations
janitor )
CNE data
Load packages:
CNE diagnostic technique
Number of cases by diagnostic technique and province of residence [link]:
- provincia_iso: ISO code of the province of residence.
- fecha: date. From pandemic onset to May 10, the date of symptom or, if not, the date of diagnosis/symptoms minus 6 days. From May 11, the date of symptom onset, or in the absence thereof, the date of diagnosis minus 3 days.
- num_casos: Number of cases by diagnostic technique and province of residence.
- num_casos_prueba_pcr: Number of cases with PCR laboratory test or molecular techniques.
- num_casos_prueba_test_ac: Number of cases with laboratory test of rapid antibody test.
- num_casos_prueba_ag: Number of cases with laboratory test of antigen detection test antigen test.
- num_casos_prueba_elisa: Number of cases with high-resolution serology laboratory testing.
- num_casos_prueba_desconocida: Number of cases without information on laboratory testing.
<- read_csv(
cne_tecnica file = here("data", "raw", "casos_tecnica_provincia.csv"),
show_col_types = FALSE
) cne_tecnica
# A tibble: 43,407 × 8
provincia_iso fecha num_casos num_casos_prueba_pcr num_casos_prueba_tes…
<chr> <date> <dbl> <dbl> <dbl>
1 A 2020-01-01 0 0 0
2 AB 2020-01-01 0 0 0
3 AL 2020-01-01 0 0 0
4 AV 2020-01-01 0 0 0
5 B 2020-01-01 0 0 0
6 BA 2020-01-01 0 0 0
7 BI 2020-01-01 0 0 0
8 BU 2020-01-01 0 0 0
9 C 2020-01-01 0 0 0
10 CA 2020-01-01 0 0 0
# … with 43,397 more rows, and 3 more variables: num_casos_prueba_ag <dbl>,
# num_casos_prueba_elisa <dbl>, num_casos_prueba_desconocida <dbl>
Check data distribution and missing data.
skim(cne_tecnica)
Name | cne_tecnica |
Number of rows | 43407 |
Number of columns | 8 |
_______________________ | |
Column type frequency: | |
character | 1 |
Date | 1 |
numeric | 6 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
provincia_iso | 819 | 0.98 | 1 | 2 | 0 | 52 | 0 |
Variable type: Date
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
fecha | 0 | 1 | 2020-01-01 | 2022-03-29 | 2021-02-13 | 819 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
num_casos | 0 | 1 | 265.67 | 973.38 | 0 | 11 | 58 | 184 | 34701 | ▇▁▁▁▁ |
num_casos_prueba_pcr | 0 | 1 | 122.34 | 328.97 | 0 | 7 | 36 | 109 | 10167 | ▇▁▁▁▁ |
num_casos_prueba_test_ac | 0 | 1 | 0.11 | 0.97 | 0 | 0 | 0 | 0 | 32 | ▇▁▁▁▁ |
num_casos_prueba_ag | 0 | 1 | 128.28 | 715.58 | 0 | 0 | 8 | 50 | 31073 | ▇▁▁▁▁ |
num_casos_prueba_elisa | 0 | 1 | 0.11 | 1.12 | 0 | 0 | 0 | 0 | 55 | ▇▁▁▁▁ |
num_casos_prueba_desconocida | 0 | 1 | 14.83 | 101.87 | 0 | 0 | 0 | 0 | 3418 | ▇▁▁▁▁ |
%>%
cne_tecnica tabyl(provincia_iso) %>%
adorn_pct_formatting() %>%
adorn_title(row_name = "Province")
NA
Province n percent valid_percent
A 819 1.9% 1.9%
AB 819 1.9% 1.9%
AL 819 1.9% 1.9%
AV 819 1.9% 1.9%
B 819 1.9% 1.9%
BA 819 1.9% 1.9%
BI 819 1.9% 1.9%
BU 819 1.9% 1.9%
C 819 1.9% 1.9%
CA 819 1.9% 1.9%
CC 819 1.9% 1.9%
CE 819 1.9% 1.9%
CO 819 1.9% 1.9%
CR 819 1.9% 1.9%
CS 819 1.9% 1.9%
CU 819 1.9% 1.9%
GC 819 1.9% 1.9%
GI 819 1.9% 1.9%
GR 819 1.9% 1.9%
GU 819 1.9% 1.9%
H 819 1.9% 1.9%
HU 819 1.9% 1.9%
J 819 1.9% 1.9%
L 819 1.9% 1.9%
LE 819 1.9% 1.9%
LO 819 1.9% 1.9%
LU 819 1.9% 1.9%
M 819 1.9% 1.9%
MA 819 1.9% 1.9%
ML 819 1.9% 1.9%
MU 819 1.9% 1.9%
NC 819 1.9% 1.9%
O 819 1.9% 1.9%
OR 819 1.9% 1.9%
P 819 1.9% 1.9%
PM 819 1.9% 1.9%
PO 819 1.9% 1.9%
S 819 1.9% 1.9%
SA 819 1.9% 1.9%
SE 819 1.9% 1.9%
SG 819 1.9% 1.9%
SO 819 1.9% 1.9%
SS 819 1.9% 1.9%
T 819 1.9% 1.9%
TE 819 1.9% 1.9%
TF 819 1.9% 1.9%
TO 819 1.9% 1.9%
V 819 1.9% 1.9%
VA 819 1.9% 1.9%
VI 819 1.9% 1.9%
Z 819 1.9% 1.9%
ZA 819 1.9% 1.9%
<NA> 819 1.9% -
Cleaning NAs
<- cne_tecnica %>%
cne_tecnica drop_na(provincia_iso)
# Show data information
skim(cne_tecnica)
Name | cne_tecnica |
Number of rows | 42588 |
Number of columns | 8 |
_______________________ | |
Column type frequency: | |
character | 1 |
Date | 1 |
numeric | 6 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
provincia_iso | 0 | 1 | 1 | 2 | 0 | 52 | 0 |
Variable type: Date
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
fecha | 0 | 1 | 2020-01-01 | 2022-03-29 | 2021-02-13 | 819 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
num_casos | 0 | 1 | 265.36 | 979.13 | 0 | 11 | 57 | 182 | 34701 | ▇▁▁▁▁ |
num_casos_prueba_pcr | 0 | 1 | 122.65 | 331.70 | 0 | 7 | 35 | 108 | 10167 | ▇▁▁▁▁ |
num_casos_prueba_test_ac | 0 | 1 | 0.11 | 0.98 | 0 | 0 | 0 | 0 | 32 | ▇▁▁▁▁ |
num_casos_prueba_ag | 0 | 1 | 127.55 | 718.45 | 0 | 0 | 8 | 49 | 31073 | ▇▁▁▁▁ |
num_casos_prueba_elisa | 0 | 1 | 0.12 | 1.13 | 0 | 0 | 0 | 0 | 55 | ▇▁▁▁▁ |
num_casos_prueba_desconocida | 0 | 1 | 14.93 | 102.69 | 0 | 0 | 0 | 0 | 3418 | ▇▁▁▁▁ |
CNE hospitalizations
Number of cases, hospitalisations, admissions to ICU and deaths by sex, age and province of residence [link]:
- provincia_iso:ISO code of the province of residence.
- fecha: date. For cases the date of diagnosis is used and for hospitalisations, ICU admissions and deaths the hospitalised cases are represented by date of hospitalisation (failing this, date of diagnosis, and if not, date of death).
- sexo: sex of cases: H (male), M (female), NC (not stated)
- grupo_edad: age group to which the case belongs: 0-9, 10-19, 20-29, 30-39, 40-49, 50-59, 60-69, 70-79, ≥80 years. NC: not stated
- num_casos: number of reported cases confirmed as having a positive diagnostic test for active infection (PDIA) as infection (PDIA) as set out in the Early Detection Strategy, surveillance and control strategy for COVID-19 and in addition cases notified before 11-May that required hospitalisation, ICU admission or required hospitalisation, ICU admission or died with a clinical diagnosis of COVID19, according to the case definitions in force at the time.
- num_hosp: number of hospitalised cases
- num_uci: number of cases admitted to ICU
- num_def: number of deaths
<- read_csv(
cne_hosp_data file = here("data", "raw", "casos_hosp_uci_def_sexo_edad_provres.csv"),
show_col_types = FALSE
) cne_hosp_data
# A tibble: 1,302,210 × 8
provincia_iso sexo grupo_edad fecha num_casos num_hosp num_uci num_def
<chr> <chr> <chr> <date> <dbl> <dbl> <dbl> <dbl>
1 A H 0-9 2020-01-01 0 0 0 0
2 A H 10-19 2020-01-01 0 0 0 0
3 A H 20-29 2020-01-01 0 0 0 0
4 A H 30-39 2020-01-01 0 0 0 0
5 A H 40-49 2020-01-01 0 0 0 0
6 A H 50-59 2020-01-01 0 0 0 0
7 A H 60-69 2020-01-01 0 0 0 0
8 A H 70-79 2020-01-01 0 0 0 0
9 A H 80+ 2020-01-01 0 0 0 0
10 A H NC 2020-01-01 0 0 0 0
# … with 1,302,200 more rows
Check data distribution and missing data.
skim(cne_hosp_data)
Name | cne_hosp_data |
Number of rows | 1302210 |
Number of columns | 8 |
_______________________ | |
Column type frequency: | |
character | 3 |
Date | 1 |
numeric | 4 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
provincia_iso | 24570 | 0.98 | 1 | 2 | 0 | 52 | 0 |
sexo | 0 | 1.00 | 1 | 2 | 0 | 3 | 0 |
grupo_edad | 0 | 1.00 | 2 | 5 | 0 | 10 | 0 |
Variable type: Date
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
fecha | 0 | 1 | 2020-01-01 | 2022-03-29 | 2021-02-13 | 819 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
num_casos | 0 | 1 | 8.86 | 50.52 | 0 | 0 | 0 | 4 | 3741 | ▇▁▁▁▁ |
num_hosp | 0 | 1 | 0.39 | 2.49 | 0 | 0 | 0 | 0 | 270 | ▇▁▁▁▁ |
num_uci | 0 | 1 | 0.04 | 0.31 | 0 | 0 | 0 | 0 | 35 | ▇▁▁▁▁ |
num_def | 0 | 1 | 0.08 | 0.80 | 0 | 0 | 0 | 0 | 100 | ▇▁▁▁▁ |
%>%
cne_hosp_data tabyl(provincia_iso, sexo) %>%
adorn_totals(where = "col") %>%
adorn_percentages(denominator = "col") %>%
adorn_pct_formatting() %>%
adorn_ns(position = "front") %>%
adorn_title(
row_name = "Province",
col_name = "Gender",
placement = "combined")
Province/Gender H M NC Total
A 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
AB 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
AL 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
AV 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
B 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
BA 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
BI 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
BU 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
C 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
CA 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
CC 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
CE 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
CO 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
CR 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
CS 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
CU 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
GC 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
GI 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
GR 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
GU 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
H 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
HU 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
J 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
L 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
LE 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
LO 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
LU 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
M 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
MA 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
ML 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
MU 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
NC 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
O 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
OR 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
P 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
PM 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
PO 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
S 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
SA 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
SE 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
SG 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
SO 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
SS 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
T 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
TE 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
TF 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
TO 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
V 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
VA 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
VI 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
Z 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
ZA 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
<NA> 8190 (1.9%) 8190 (1.9%) 8190 (1.9%) 24570 (1.9%)
Cleaning NAs
<- cne_hosp_data %>%
cne_hosp_data drop_na(provincia_iso)
skim(cne_hosp_data)
Name | cne_hosp_data |
Number of rows | 1277640 |
Number of columns | 8 |
_______________________ | |
Column type frequency: | |
character | 3 |
Date | 1 |
numeric | 4 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
provincia_iso | 0 | 1 | 1 | 2 | 0 | 52 | 0 |
sexo | 0 | 1 | 1 | 2 | 0 | 3 | 0 |
grupo_edad | 0 | 1 | 2 | 5 | 0 | 10 | 0 |
Variable type: Date
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
fecha | 0 | 1 | 2020-01-01 | 2022-03-29 | 2021-02-13 | 819 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
num_casos | 0 | 1 | 8.85 | 50.82 | 0 | 0 | 0 | 4 | 3741 | ▇▁▁▁▁ |
num_hosp | 0 | 1 | 0.39 | 2.51 | 0 | 0 | 0 | 0 | 270 | ▇▁▁▁▁ |
num_uci | 0 | 1 | 0.04 | 0.31 | 0 | 0 | 0 | 0 | 35 | ▇▁▁▁▁ |
num_def | 0 | 1 | 0.08 | 0.80 | 0 | 0 | 0 | 0 | 100 | ▇▁▁▁▁ |
Combination
We proceed to combine both datasets.
Before the combination it is neccesary to group the demographic data from the hospitalisation dataframe.
<- cne_hosp_data %>%
cne_hosp_data_grouped group_by(provincia_iso, fecha) %>%
summarise_at(vars(num_casos, num_hosp, num_uci, num_def), sum)
Data combination:
<- inner_join(cne_tecnica,
cne_data
cne_hosp_data_grouped, by = c("provincia_iso", "fecha"))
cne_data
# A tibble: 42,588 × 12
provincia_iso fecha num_casos.x num_casos_prueba_pcr num_casos_prueba_t…
<chr> <date> <dbl> <dbl> <dbl>
1 A 2020-01-01 0 0 0
2 AB 2020-01-01 0 0 0
3 AL 2020-01-01 0 0 0
4 AV 2020-01-01 0 0 0
5 B 2020-01-01 0 0 0
6 BA 2020-01-01 0 0 0
7 BI 2020-01-01 0 0 0
8 BU 2020-01-01 0 0 0
9 C 2020-01-01 0 0 0
10 CA 2020-01-01 0 0 0
# … with 42,578 more rows, and 7 more variables: num_casos_prueba_ag <dbl>,
# num_casos_prueba_elisa <dbl>, num_casos_prueba_desconocida <dbl>,
# num_casos.y <dbl>, num_hosp <dbl>, num_uci <dbl>, num_def <dbl>
The total number of cases aggregated from hospitalisation data (‘num_casos.y’) should be the same as the data extracted from the detection tecnica.
str_glue("Is the aggregated data equal? {sum(cne_data$num_casos.x) == sum(cne_data$num_casos.y)}")
Is the aggregated data equal? TRUE
str_glue("Summatory from tecnica data: {sum(cne_data$num_casos.x)}")
Summatory from tecnica data: 11301030
str_glue("Summatory from hospitalisation data: {sum(cne_data$num_casos.y)}")
Summatory from hospitalisation data: 11301030
Hence it is possible to remove one of them.
<- cne_data %>%
cne_data rename(num_casos = num_casos.x) %>%
select(-num_casos.y) # Remove num_casos from hospitalisation data)
Result:
cne_data
# A tibble: 42,588 × 11
provincia_iso fecha num_casos num_casos_prueba_pcr num_casos_prueba_tes…
<chr> <date> <dbl> <dbl> <dbl>
1 A 2020-01-01 0 0 0
2 AB 2020-01-01 0 0 0
3 AL 2020-01-01 0 0 0
4 AV 2020-01-01 0 0 0
5 B 2020-01-01 0 0 0
6 BA 2020-01-01 0 0 0
7 BI 2020-01-01 0 0 0
8 BU 2020-01-01 0 0 0
9 C 2020-01-01 0 0 0
10 CA 2020-01-01 0 0 0
# … with 42,578 more rows, and 6 more variables: num_casos_prueba_ag <dbl>,
# num_casos_prueba_elisa <dbl>, num_casos_prueba_desconocida <dbl>,
# num_hosp <dbl>, num_uci <dbl>, num_def <dbl>
The data contains information from 2020-01-01 to 2022-03-29.