Computing data values for the open age group needed to close a series • rddharmony

The function used to carry out this process is defined as follows:

dd_oag_compute(data, age_span = c(1, 5))¹

To show how this function works, we shall use the oag_compute_df dataset that is embedded on this package. This is an example of an abridged series so age_span == 5.

## Load the packages required
library(rddharmony)
library(kableExtra)
library(dplyr)
library(purrr)

## Create a function to be used to generate the table output
tab_output <- function(tab){
kable(tab, booktabs=TRUE, align = "c",table.envir = "capctable", longtable = TRUE) %>%
  kable_styling() %>%
  row_spec(0, bold = T, color = "white", background = "#6ebed8") %>% 
  kable_paper(html_font = "helvetica") %>%
  scroll_box(width = "100%", height = "300px")
  
}

oag_compute_df %>% tab_output()

AgeStart	AgeEnd	AgeLabel	AgeSpan	DataSourceYear	DataValue
10	15	10-14	5	NA	4
15	20	15-19	5	2017	1124
20	25	20-24	5	2017	14048
25	30	25-29	5	2017	36525
30	35	30-34	5	2017	38273
35	40	35-39	5	2017	20046
40	45	40-44	5	2017	4544
45	50	45-49	5	2017	278
50	0	50+	-1	2017	27
0	-1	Total	-1	2017	114870
-2	-2	Unknown	-2	2017	1

From the data, the open age needed to close this series is present (50+ with a data value of 27), so we will drop it for now and show how it can be re-calculated.

new_df <- oag_compute_df %>% 
  filter(!AgeLabel %in% grep("\\+", AgeLabel, value = TRUE, ignore.case = TRUE))

new_df %>% tab_output()

AgeStart	AgeEnd	AgeLabel	AgeSpan	DataSourceYear	DataValue
10	15	10-14	5	NA	4
15	20	15-19	5	2017	1124
20	25	20-24	5	2017	14048
25	30	25-29	5	2017	36525
30	35	30-34	5	2017	38273
35	40	35-39	5	2017	20046
40	45	40-44	5	2017	4544
45	50	45-49	5	2017	278
0	-1	Total	-1	2017	114870
-2	-2	Unknown	-2	2017	1

We start by defining the standard age groups using the std_age_function()² function.

std_ages <- std_age_function()

std_ages %>% tab_output()

AgeStart	AgeEnd	AgeLabel	AgeSpan	abridged	complete	AgeSort
0	1	0	1	TRUE	TRUE	1
1	5	1-4	4	TRUE	FALSE	2
0	5	0-4	5	TRUE	FALSE	3
5	10	5-9	5	TRUE	FALSE	4
10	15	10-14	5	TRUE	FALSE	5
15	20	15-19	5	TRUE	FALSE	6
20	25	20-24	5	TRUE	FALSE	7
25	30	25-29	5	TRUE	FALSE	8
30	35	30-34	5	TRUE	FALSE	9
35	40	35-39	5	TRUE	FALSE	10
40	45	40-44	5	TRUE	FALSE	11
45	50	45-49	5	TRUE	FALSE	12
50	55	50-54	5	TRUE	FALSE	13
55	60	55-59	5	TRUE	FALSE	14
60	65	60-64	5	TRUE	FALSE	15
65	70	65-69	5	TRUE	FALSE	16
70	75	70-74	5	TRUE	FALSE	17
75	80	75-79	5	TRUE	FALSE	18
80	85	80-84	5	TRUE	FALSE	19
85	90	85-89	5	TRUE	FALSE	20
90	95	90-94	5	TRUE	FALSE	21
95	100	95-99	5	TRUE	FALSE	22
100	105	100-104	5	TRUE	FALSE	23
105	110	105-109	5	TRUE	FALSE	24
110	115	110-114	5	TRUE	FALSE	25
115	120	115-119	5	TRUE	FALSE	26
120	125	120-124	5	TRUE	FALSE	27
125	130	125-129	5	TRUE	FALSE	28
1	2	1	1	FALSE	TRUE	29
2	3	2	1	FALSE	TRUE	30
3	4	3	1	FALSE	TRUE	31
4	5	4	1	FALSE	TRUE	32
5	6	5	1	FALSE	TRUE	33
6	7	6	1	FALSE	TRUE	34
7	8	7	1	FALSE	TRUE	35
8	9	8	1	FALSE	TRUE	36
9	10	9	1	FALSE	TRUE	37
10	11	10	1	FALSE	TRUE	38
11	12	11	1	FALSE	TRUE	39
12	13	12	1	FALSE	TRUE	40
13	14	13	1	FALSE	TRUE	41
14	15	14	1	FALSE	TRUE	42
15	16	15	1	FALSE	TRUE	43
16	17	16	1	FALSE	TRUE	44
17	18	17	1	FALSE	TRUE	45
18	19	18	1	FALSE	TRUE	46
19	20	19	1	FALSE	TRUE	47
20	21	20	1	FALSE	TRUE	48
21	22	21	1	FALSE	TRUE	49
22	23	22	1	FALSE	TRUE	50
23	24	23	1	FALSE	TRUE	51
24	25	24	1	FALSE	TRUE	52
25	26	25	1	FALSE	TRUE	53
26	27	26	1	FALSE	TRUE	54
27	28	27	1	FALSE	TRUE	55
28	29	28	1	FALSE	TRUE	56
29	30	29	1	FALSE	TRUE	57
30	31	30	1	FALSE	TRUE	58
31	32	31	1	FALSE	TRUE	59
32	33	32	1	FALSE	TRUE	60
33	34	33	1	FALSE	TRUE	61
34	35	34	1	FALSE	TRUE	62
35	36	35	1	FALSE	TRUE	63
36	37	36	1	FALSE	TRUE	64
37	38	37	1	FALSE	TRUE	65
38	39	38	1	FALSE	TRUE	66
39	40	39	1	FALSE	TRUE	67
40	41	40	1	FALSE	TRUE	68
41	42	41	1	FALSE	TRUE	69
42	43	42	1	FALSE	TRUE	70
43	44	43	1	FALSE	TRUE	71
44	45	44	1	FALSE	TRUE	72
45	46	45	1	FALSE	TRUE	73
46	47	46	1	FALSE	TRUE	74
47	48	47	1	FALSE	TRUE	75
48	49	48	1	FALSE	TRUE	76
49	50	49	1	FALSE	TRUE	77
50	51	50	1	FALSE	TRUE	78
51	52	51	1	FALSE	TRUE	79
52	53	52	1	FALSE	TRUE	80
53	54	53	1	FALSE	TRUE	81
54	55	54	1	FALSE	TRUE	82
55	56	55	1	FALSE	TRUE	83
56	57	56	1	FALSE	TRUE	84
57	58	57	1	FALSE	TRUE	85
58	59	58	1	FALSE	TRUE	86
59	60	59	1	FALSE	TRUE	87
60	61	60	1	FALSE	TRUE	88
61	62	61	1	FALSE	TRUE	89
62	63	62	1	FALSE	TRUE	90
63	64	63	1	FALSE	TRUE	91
64	65	64	1	FALSE	TRUE	92
65	66	65	1	FALSE	TRUE	93
66	67	66	1	FALSE	TRUE	94
67	68	67	1	FALSE	TRUE	95
68	69	68	1	FALSE	TRUE	96
69	70	69	1	FALSE	TRUE	97
70	71	70	1	FALSE	TRUE	98
71	72	71	1	FALSE	TRUE	99
72	73	72	1	FALSE	TRUE	100
73	74	73	1	FALSE	TRUE	101
74	75	74	1	FALSE	TRUE	102
75	76	75	1	FALSE	TRUE	103
76	77	76	1	FALSE	TRUE	104
77	78	77	1	FALSE	TRUE	105
78	79	78	1	FALSE	TRUE	106
79	80	79	1	FALSE	TRUE	107
80	81	80	1	FALSE	TRUE	108
81	82	81	1	FALSE	TRUE	109
82	83	82	1	FALSE	TRUE	110
83	84	83	1	FALSE	TRUE	111
84	85	84	1	FALSE	TRUE	112
85	86	85	1	FALSE	TRUE	113
86	87	86	1	FALSE	TRUE	114
87	88	87	1	FALSE	TRUE	115
88	89	88	1	FALSE	TRUE	116
89	90	89	1	FALSE	TRUE	117
90	91	90	1	FALSE	TRUE	118
91	92	91	1	FALSE	TRUE	119
92	93	92	1	FALSE	TRUE	120
93	94	93	1	FALSE	TRUE	121
94	95	94	1	FALSE	TRUE	122
95	96	95	1	FALSE	TRUE	123
96	97	96	1	FALSE	TRUE	124
97	98	97	1	FALSE	TRUE	125
98	99	98	1	FALSE	TRUE	126
99	100	99	1	FALSE	TRUE	127
100	101	100	1	FALSE	TRUE	128
101	102	101	1	FALSE	TRUE	129
102	103	102	1	FALSE	TRUE	130
103	104	103	1	FALSE	TRUE	131
104	105	104	1	FALSE	TRUE	132
105	106	105	1	FALSE	TRUE	133
106	107	106	1	FALSE	TRUE	134
107	108	107	1	FALSE	TRUE	135
108	109	108	1	FALSE	TRUE	136
109	110	109	1	FALSE	TRUE	137
110	111	110	1	FALSE	TRUE	138
111	112	111	1	FALSE	TRUE	139
112	113	112	1	FALSE	TRUE	140
113	114	113	1	FALSE	TRUE	141
114	115	114	1	FALSE	TRUE	142
115	116	115	1	FALSE	TRUE	143
116	117	116	1	FALSE	TRUE	144
117	118	117	1	FALSE	TRUE	145
118	119	118	1	FALSE	TRUE	146
119	120	119	1	FALSE	TRUE	147
120	121	120	1	FALSE	TRUE	148
121	122	121	1	FALSE	TRUE	149
122	123	122	1	FALSE	TRUE	150
123	124	123	1	FALSE	TRUE	151
124	125	124	1	FALSE	TRUE	152
125	126	125	1	FALSE	TRUE	153
126	127	126	1	FALSE	TRUE	154
127	128	127	1	FALSE	TRUE	155
128	129	128	1	FALSE	TRUE	156
129	130	129	1	FALSE	TRUE	157
5	0	5+	-1	TRUE	TRUE	158
10	0	10+	-1	TRUE	TRUE	159
15	0	15+	-1	TRUE	TRUE	160
20	0	20+	-1	TRUE	TRUE	161
25	0	25+	-1	TRUE	TRUE	162
30	0	30+	-1	TRUE	TRUE	163
35	0	35+	-1	TRUE	TRUE	164
40	0	40+	-1	TRUE	TRUE	165
45	0	45+	-1	TRUE	TRUE	166
50	0	50+	-1	TRUE	TRUE	167
55	0	55+	-1	TRUE	TRUE	168
60	0	60+	-1	TRUE	TRUE	169
65	0	65+	-1	TRUE	TRUE	170
70	0	70+	-1	TRUE	TRUE	171
75	0	75+	-1	TRUE	TRUE	172
80	0	80+	-1	TRUE	TRUE	173
85	0	85+	-1	TRUE	TRUE	174
90	0	90+	-1	TRUE	TRUE	175
95	0	95+	-1	TRUE	TRUE	176
100	0	100+	-1	TRUE	TRUE	177
105	0	105+	-1	TRUE	TRUE	178
110	0	110+	-1	TRUE	TRUE	179
115	0	115+	-1	TRUE	TRUE	180
120	0	120+	-1	TRUE	TRUE	181
125	0	125+	-1	TRUE	TRUE	182
130	0	130+	-1	TRUE	TRUE	183
0	-1	Total	-1	TRUE	TRUE	184
-2	-2	Unknown	-2	TRUE	TRUE	185

Since the data we are dealing with is an abridged series, we subset the standard age groups to only contain standard abridged age groups, and drop the variables that specify whether the records are abridged or complete.

age_std <- std_ages %>%
             filter(abridged == TRUE) %>%
             select(-abridged, -complete)

age_std %>% tab_output()

AgeStart	AgeEnd	AgeLabel	AgeSpan	AgeSort
0	1	0	1	1
1	5	1-4	4	2
0	5	0-4	5	3
5	10	5-9	5	4
10	15	10-14	5	5
15	20	15-19	5	6
20	25	20-24	5	7
25	30	25-29	5	8
30	35	30-34	5	9
35	40	35-39	5	10
40	45	40-44	5	11
45	50	45-49	5	12
50	55	50-54	5	13
55	60	55-59	5	14
60	65	60-64	5	15
65	70	65-69	5	16
70	75	70-74	5	17
75	80	75-79	5	18
80	85	80-84	5	19
85	90	85-89	5	20
90	95	90-94	5	21
95	100	95-99	5	22
100	105	100-104	5	23
105	110	105-109	5	24
110	115	110-114	5	25
115	120	115-119	5	26
120	125	120-124	5	27
125	130	125-129	5	28
5	0	5+	-1	158
10	0	10+	-1	159
15	0	15+	-1	160
20	0	20+	-1	161
25	0	25+	-1	162
30	0	30+	-1	163
35	0	35+	-1	164
40	0	40+	-1	165
45	0	45+	-1	166
50	0	50+	-1	167
55	0	55+	-1	168
60	0	60+	-1	169
65	0	65+	-1	170
70	0	70+	-1	171
75	0	75+	-1	172
80	0	80+	-1	173
85	0	85+	-1	174
90	0	90+	-1	175
95	0	95+	-1	176
100	0	100+	-1	177
105	0	105+	-1	178
110	0	110+	-1	179
115	0	115+	-1	180
120	0	120+	-1	181
125	0	125+	-1	182
130	0	130+	-1	183
0	-1	Total	-1	184
-2	-2	Unknown	-2	185

We then subset the data in question to remove any missing data values

age_span = 5
df <- new_df %>%
    dplyr::filter(!is.na(DataValue) & AgeSpan %in% c(age_span, -1, -2))

df %>% tab_output()

AgeStart	AgeEnd	AgeLabel	AgeSpan	DataSourceYear	DataValue
10	15	10-14	5	NA	4
15	20	15-19	5	2017	1124
20	25	20-24	5	2017	14048
25	30	25-29	5	2017	36525
30	35	30-34	5	2017	38273
35	40	35-39	5	2017	20046
40	45	40-44	5	2017	4544
45	50	45-49	5	2017	278
0	-1	Total	-1	2017	114870
-2	-2	Unknown	-2	2017	1

We later identify the start age of the open age group needed to close the series using the dd_oag_agestart()³ function

oag_start <- df %>% dd_oag_agestart
oag_start
#> [1] 50

From there, we list the possible standard open age groups that can be computed from the standard age groups.

  if (!is_empty(oag_start)) {
    age_std_open <- age_std %>%
      dplyr::filter(AgeSpan < 0 & AgeStart <= oag_start & !(AgeStart %in% c(-2,0)))
  } else {
    age_std_open <- NULL
  }
age_std_open %>% tab_output()

AgeStart	AgeLabel	AgeSpan	AgeSort
5	5+	-1	158
10	10+	-1	159
15	15+	-1	160
20	20+	-1	161
25	25+	-1	162
30	30+	-1	163
35	35+	-1	164
40	40+	-1	165
45	45+	-1	166
50	50+	-1	167

We proceed to look for an open age group record on input file that closes out the series. If an open age group already exists, the number of records for this output should be greater than 0 but since we already dropped the age group needed to close the series at the beginning for demonstration purposes, the result of this will be NULL.

oag_indata <- df %>%
      dplyr::filter(AgeLabel == paste0(oag_start,"+")) %>%
      select(AgeStart, AgeEnd, AgeSpan, AgeLabel, DataValue)
nrow(oag_indata)
#> [1] 0

## check if an open age group already exists
oag_check <- nrow(oag_indata) > 0
oag_check
#> [1] FALSE

We then check if there is a Total age label present in the data, and whether it is valid i.e is it an actual total or is it different from the sum of the data values of the age labels present in the data?

# check if there is a "Total" age label in the data
total_check <- "Total" %in% df$AgeLabel

# and whether it is valid
if (total_check) {
 total_value <- df$DataValue[df$AgeLabel=="Total"]
 total_value_valid <- total_value >= suppressWarnings(sum(df$DataValue[df$AgeSpan == age_span]))
} else 
{ total_value_valid <- FALSE 
}
total_value_valid
#> [1] TRUE

We also check if there is an unknown age label that exists in the data, and if it does, we extract its value, if it does not, we set the value to 0

unknown_check <- "Unknown" %in% df$AgeLabel
unknown_check
#> [1] TRUE
unknown_value <- ifelse(unknown_check, df$DataValue[df$AgeLabel=="Unknown"], 0)
unknown_value
#> [1] 1

In this case, an unknown exists and its data value is 1.

If an open age group record already exists, we use this to compute other open age groups.

    if (oag_check) {

      data.out <- NULL
      for (i in 1:nrow(age_std_open)) {
        df.out <- df %>%
          dplyr::filter((AgeStart >= age_std_open$AgeStart[i] & AgeSpan > 0) | AgeLabel==paste0(oag_start,"+")) %>%
          select(AgeStart,AgeEnd,AgeSpan,AgeLabel,DataValue) %>%
          summarise(DataValue = sum(DataValue)) %>%
          mutate(AgeStart = age_std_open$AgeStart[i],
                 AgeEnd   = age_std_open$AgeEnd[i],
                 AgeSpan  = age_std_open$AgeSpan[i],
                 AgeLabel = age_std_open$AgeLabel[i],
                 AgeSort  = age_std_open$AgeSort[i])
        data.out <- rbind(data.out,df.out)
      }

    }

But in our case, we know that the open age does not exist in the series since we already dropped it at the beginning, but there is a total, so we use this to compute the possible oag values.

    # if the needed open age group is not on the series, but there is a total, then use this to compute oag values
    if (!(oag_check) & total_check & total_value_valid) {

      data.out <- NULL
      for (i in 1:nrow(age_std_open)) {
        df.out <- df %>%
          dplyr::filter(AgeStart < age_std_open$AgeStart[i] & AgeSpan > 0) %>%
          select(AgeStart,AgeEnd,AgeSpan,AgeLabel,DataValue) %>%
          summarise(DataValue = total_value - unknown_value - sum(DataValue)) %>%
          mutate(AgeStart = age_std_open$AgeStart[i],
                 AgeEnd   = age_std_open$AgeEnd[i],
                 AgeSpan  = age_std_open$AgeSpan[i],
                 AgeLabel = age_std_open$AgeLabel[i],
                 AgeSort  = age_std_open$AgeSort[i])
        data.out <- rbind(data.out,df.out)
      }
    }

data.out %>% 
  relocate(DataValue , .after = last_col()) %>% 
  tab_output()

AgeStart	AgeSpan	AgeLabel	AgeSort	DataValue
5	-1	5+	158	114869
10	-1	10+	159	114869
15	-1	15+	160	114865
20	-1	20+	161	113741
25	-1	25+	162	99693
30	-1	30+	163	63168
35	-1	35+	164	24895
40	-1	40+	165	4849
45	-1	45+	166	305
50	-1	50+	167	27

As shown in DDharmonize_Vitals5 ⁴, the result of this function is integrated back into the data to end up with only one open age group for the series. This is done by picking the AgeLabel in the output above whose AgeStart corresponds to the open age group needed to close the series, in this case 50.

fin_df <- df %>%
        bind_rows(data.out[!(data.out$AgeLabel %in% df$AgeLabel) &
                              data.out$AgeStart == oag_start,]) %>%
       arrange(AgeSort)

fin_df %>% arrange(AgeLabel) %>%  tab_output()

AgeStart	AgeEnd	AgeLabel	AgeSpan	DataSourceYear	DataValue	AgeSort
10	15	10-14	5	NA	4	NA
15	20	15-19	5	2017	1124	NA
20	25	20-24	5	2017	14048	NA
25	30	25-29	5	2017	36525	NA
30	35	30-34	5	2017	38273	NA
35	40	35-39	5	2017	20046	NA
40	45	40-44	5	2017	4544	NA
45	50	45-49	5	2017	278	NA
50	0	50+	-1	NA	27	167
0	-1	Total	-1	2017	114870	NA
-2	-2	Unknown	-2	2017	1	NA

From the data above, the open age group needed to close the series is “50+” with a data value of 27, similar to the record that we dropped in the beginning.

NB: if neither an open age group nor a Total value exists, then we don’t compute open age group values.

AgeStart	AgeLabel	AgeSpan	AgeSort
5	5+	-1	158
10	10+	-1	159
15	15+	-1	160
20	20+	-1	161
25	25+	-1	162
30	30+	-1	163
35	35+	-1	164
40	40+	-1	165
45	45+	-1	166
50	50+	-1	167

AgeStart	AgeLabel	AgeSpan	AgeSort
5	5+	-1	158
10	10+	-1	159
15	15+	-1	160
20	20+	-1	161
25	25+	-1	162
30	30+	-1	163
35	35+	-1	164
40	40+	-1	165
45	45+	-1	166
50	50+	-1	167

AgeStart	AgeLabel	AgeSpan	AgeSort
5	5+	-1	158
10	10+	-1	159
15	15+	-1	160
20	20+	-1	161
25	25+	-1	162
30	30+	-1	163
35	35+	-1	164
40	40+	-1	165
45	45+	-1	166
50	50+	-1	167