Introduction to samplingin
Population DataFrame
We’ll use the dataset pop_dt. It contains a tabulation of Indonesia’s population by regency/city and gender, based on the results of the 2020 population census from BPS-Statistics Indonesia (https://sensus.bps.go.id/main/index/sp2020).
dim(pop_dt)
#> [1] 514 8
pop_dt %>% head()
#> idkab kdprov kdkab nmprov nmkab Laki-laki Perempuan Total
#> 1 1101 11 01 ACEH SIMEULUE 47630 45235 92865
#> 2 1102 11 02 ACEH ACEH SINGKIL 63978 62536 126514
#> 3 1103 11 03 ACEH ACEH SELATAN 116542 115872 232414
#> 4 1104 11 04 ACEH ACEH TENGGARA 110799 110061 220860
#> 5 1105 11 05 ACEH ACEH TIMUR 212286 210115 422401
#> 6 1106 11 06 ACEH ACEH TENGAH 109262 106314 215576
Allocation DataFrame
The dataset used is alokasi_dt, which contains the sample allocation for each province: jml_kabkota is the number of regencies/cities in the province and n_primary is the number of primary units to be selected.
dim(alokasi_dt)
#> [1] 34 3
alokasi_dt
#> # A tibble: 34 x 3
#> kdprov jml_kabkota n_primary
#> <chr> <int> <dbl>
#> 1 11 23 4
#> 2 12 33 5
#> 3 13 19 3
#> 4 14 12 3
#> # ... with 30 more rows
Simple Random Sampling (SRS)
A simple random sample is a randomly selected subset of a population. In this sampling method, each member of the population has an exactly equal chance of being selected.
The following is the syntax for simple random sampling. Use parameter method = 'srs'
# Simple random sample of primary units (flagged "U"), drawn separately for
# each kdprov group; the sample size per group is read from the n_primary
# column of the allocation table.
dtSampling_srs <- doSampling(
  pop = pop_dt,
  alloc = alokasi_dt,
  nsample = "n_primary",
  seed = 7891,
  method = "srs",
  ident = "kdprov",
  type = "U"
)
#> no implicit stratification variable chosen, sort by: kdprov
#> Negative allocation: 0
#> Zero allocation: 0
#> kdprov 11
#> kdprov 12
#> kdprov 13
#> kdprov 14
#> kdprov 15
#> kdprov 16
#> kdprov 17
#> kdprov 18
#> kdprov 19
#> kdprov 21
#> kdprov 31
#> kdprov 32
#> kdprov 33
#> kdprov 34
#> kdprov 35
#> kdprov 36
#> kdprov 51
#> kdprov 52
#> kdprov 53
#> kdprov 61
#> kdprov 62
#> kdprov 63
#> kdprov 64
#> kdprov 65
#> kdprov 71
#> kdprov 72
#> kdprov 73
#> kdprov 74
#> kdprov 75
#> kdprov 76
#> kdprov 81
#> kdprov 82
#> kdprov 91
#> kdprov 94
#> Joining with `by = join_by(kdprov, tmp_strata)`
#> All allocations have been selected. Selected 100 out of 100 (100%)
Displaying the primary sampling result
Population Sampled
head(dtSampling_srs$pop)
#> idkab kdprov kdkab nmprov nmkab Laki-laki Perempuan Total flags
#> 1 1101 11 01 ACEH SIMEULUE 47630 45235 92865 U
#> 2 1102 11 02 ACEH ACEH SINGKIL 63978 62536 126514 <NA>
#> 3 1103 11 03 ACEH ACEH SELATAN 116542 115872 232414 <NA>
#> 4 1104 11 04 ACEH ACEH TENGGARA 110799 110061 220860 <NA>
#> 5 1105 11 05 ACEH ACEH TIMUR 212286 210115 422401 <NA>
#> 6 1106 11 06 ACEH ACEH TENGAH 109262 106314 215576 <NA>
#> tanggal
#> 1 28/09/24
#> 2 <NA>
#> 3 <NA>
#> 4 <NA>
#> 5 <NA>
#> 6 <NA>
Units Sampled
head(dtSampling_srs$sampledf)
#> idkab kdprov kdkab nmprov nmkab Laki-laki Perempuan Total flags
#> 1 1101 11 01 ACEH SIMEULUE 47630 45235 92865 U
#> 2 1118 11 18 ACEH PIDIE JAYA 78742 79655 158397 U
#> 3 1171 11 71 ACEH BANDA ACEH 127435 125464 252899 U
#> 4 1172 11 72 ACEH SABANG 20838 20359 41197 U
#> 5 1201 12 01 SUMATERA UTARA NIAS 71686 74986 146672 U
#> 6 1224 12 24 SUMATERA UTARA NIAS UTARA 73216 74058 147274 U
#> tanggal
#> 1 28/09/24
#> 2 28/09/24
#> 3 28/09/24
#> 4 28/09/24
#> 5 28/09/24
#> 6 28/09/24
dtSampling_srs$sampledf %>% nrow
#> [1] 100
Sampling Details
head(dtSampling_srs$details)
#> kdprov jml_kabkota n_primary tmp_strata npop n_deficit n_selected
#> 1 11 23 4 1 23 0 4
#> 2 12 33 5 1 33 0 5
#> 3 13 19 3 1 19 0 3
#> 4 14 12 3 1 12 0 3
#> 5 15 11 3 1 11 0 3
#> 6 16 17 3 1 17 0 3
Systematic Random Sampling
Systematic random sampling is a method of selecting samples at a fixed, preset interval. Using the population and allocation data introduced above, we will carry out systematic random sampling with the doSampling function from the samplingin package. Use the parameter method = 'systematic'.
Primary Units Sampling
The following is the syntax for sampling the primary units
# Systematic sample of primary units (flagged "U") within each kdprov group,
# sized by the n_primary column of the allocation table.
dtSampling_u <- doSampling(
  pop = pop_dt,
  alloc = alokasi_dt,
  nsample = "n_primary",
  seed = 2,
  method = "systematic",
  ident = "kdprov",
  type = "U"
)
#> no implicit stratification variable chosen, sort by: kdprov
#> Negative allocation: 0
#> Zero allocation: 0
#> kdprov 11
#> kdprov 12
#> kdprov 13
#> kdprov 14
#> kdprov 15
#> kdprov 16
#> kdprov 17
#> kdprov 18
#> kdprov 19
#> kdprov 21
#> kdprov 31
#> kdprov 32
#> kdprov 33
#> kdprov 34
#> kdprov 35
#> kdprov 36
#> kdprov 51
#> kdprov 52
#> kdprov 53
#> kdprov 61
#> kdprov 62
#> kdprov 63
#> kdprov 64
#> kdprov 65
#> kdprov 71
#> kdprov 72
#> kdprov 73
#> kdprov 74
#> kdprov 75
#> kdprov 76
#> kdprov 81
#> kdprov 82
#> kdprov 91
#> kdprov 94
#> Joining with `by = join_by(kdprov, tmp_strata)`
#> All allocations have been selected. Selected 100 out of 100 (100%)
Displaying the primary sampling result
Population Sampled
head(dtSampling_u$pop)
#> idkab kdprov kdkab nmprov nmkab Laki-laki Perempuan Total flags
#> 1 1101 11 01 ACEH SIMEULUE 47630 45235 92865 U
#> 2 1102 11 02 ACEH ACEH SINGKIL 63978 62536 126514 <NA>
#> 3 1103 11 03 ACEH ACEH SELATAN 116542 115872 232414 <NA>
#> 4 1104 11 04 ACEH ACEH TENGGARA 110799 110061 220860 <NA>
#> 5 1105 11 05 ACEH ACEH TIMUR 212286 210115 422401 <NA>
#> 6 1106 11 06 ACEH ACEH TENGAH 109262 106314 215576 <NA>
#> tanggal
#> 1 28/09/24
#> 2 <NA>
#> 3 <NA>
#> 4 <NA>
#> 5 <NA>
#> 6 <NA>
Units Sampled
head(dtSampling_u$sampledf)
#> idkab kdprov kdkab nmprov nmkab Laki-laki Perempuan Total
#> 1 1101 11 01 ACEH SIMEULUE 47630 45235 92865
#> 2 1107 11 07 ACEH ACEH BARAT 100492 98244 198736
#> 3 1113 11 13 ACEH GAYO LUES 50026 49506 99532
#> 4 1118 11 18 ACEH PIDIE JAYA 78742 79655 158397
#> 5 1205 12 05 SUMATERA UTARA TAPANULI UTARA 156176 156582 312758
#> 6 1211 12 11 SUMATERA UTARA KARO 200247 204751 404998
#> flags tanggal
#> 1 U 28/09/24
#> 2 U 28/09/24
#> 3 U 28/09/24
#> 4 U 28/09/24
#> 5 U 28/09/24
#> 6 U 28/09/24
dtSampling_u$sampledf %>% nrow
#> [1] 100
Sampling Details
head(dtSampling_u$details)
#> kdprov jml_kabkota n_primary tmp_strata npop ar k n_deficit
#> 1 11 23 4 1 23 0.1848823 5.750000 0
#> 2 12 33 5 1 33 0.7023740 6.600000 0
#> 3 13 19 3 1 19 0.5733263 6.333333 0
#> 4 14 12 3 1 12 0.1680519 4.000000 0
#> 5 15 11 3 1 11 0.9438393 3.666667 0
#> 6 16 17 3 1 17 0.9434750 5.666667 0
#> n_selected
#> 1 4
#> 2 5
#> 3 3
#> 4 3
#> 5 3
#> 6 3
Secondary Units Sampling
To sample secondary units, we use the population returned by the previous sampling step, in which the selected primary units have already been flagged. Add the parameter is_secondary = TRUE to the doSampling call.
# Secondary allocation: twice the primary allocation in every province.
alokasi_dt_p <- mutate(alokasi_dt, n_secondary = 2 * n_primary)

# Second-stage systematic draw on the population already flagged by the
# primary sampling (dtSampling_u$pop); newly selected units are flagged "P".
dtSampling_p <- doSampling(
  pop = dtSampling_u$pop,
  alloc = alokasi_dt_p,
  nsample = "n_secondary",
  seed = 243,
  method = "systematic",
  ident = "kdprov",
  type = "P",
  is_secondary = TRUE
)
#> no implicit stratification variable chosen, sort by: kdprov
#> Negative allocation: 0
#> Zero allocation: 0
#> Sampling Secondary Units
#> kdprov 11
#> kdprov 12
#> kdprov 13
#> kdprov 14
#> kdprov 15
#> kdprov 16
#> kdprov 17
#> kdprov 18
#> kdprov 19
#> kdprov 21
#> kdprov 31
#> kdprov 32
#> kdprov 33
#> kdprov 34
#> kdprov 35
#> kdprov 36
#> kdprov 51
#> kdprov 52
#> kdprov 53
#> kdprov 61
#> kdprov 62
#> kdprov 63
#> kdprov 64
#> kdprov 65
#> kdprov 71
#> kdprov 72
#> kdprov 73
#> kdprov 74
#> kdprov 75
#> kdprov 76
#> kdprov 81
#> kdprov 82
#> kdprov 91
#> kdprov 94
#> Joining with `by = join_by(kdprov, tmp_strata, is_secondary_tmp)`
#> WARNING: There are still 2 allocations for which samples have not been selected. Selected 198 out of 200 (99%)
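The warning shows that 2 allocated units could not be selected. To see which allocations were not fully met, filter the sampling details for rows with a positive n_deficit. In both provinces listed below, only 3 units are available for secondary sampling (npop = 3, i.e. 5 regencies/cities minus the 2 already selected as primary units), while the allocation asks for 4 (n_secondary), leaving a deficit of 1 each: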
# Allocation rows whose requested sample size could not be fully met.
filter(dtSampling_p$details, n_deficit > 0)
#> kdprov jml_kabkota n_primary n_secondary tmp_strata npop ar k
#> 1 34 5 2 4 1 3 0.05382431 0.75
#> 2 65 5 2 4 1 3 0.42914161 0.75
#> n_deficit n_selected
#> 1 1 3
#> 2 1 3
Displaying the secondary sampling result
Population Sampled
head(dtSampling_p$pop)
#> idkab kdprov kdkab nmprov nmkab Laki-laki Perempuan Total flags
#> 1 1101 11 01 ACEH SIMEULUE 47630 45235 92865 U
#> 2 1102 11 02 ACEH ACEH SINGKIL 63978 62536 126514 P
#> 3 1103 11 03 ACEH ACEH SELATAN 116542 115872 232414 <NA>
#> 4 1104 11 04 ACEH ACEH TENGGARA 110799 110061 220860 P
#> 5 1105 11 05 ACEH ACEH TIMUR 212286 210115 422401 <NA>
#> 6 1106 11 06 ACEH ACEH TENGAH 109262 106314 215576 <NA>
#> tanggal
#> 1 28/09/24
#> 2 28/09/24
#> 3 <NA>
#> 4 28/09/24
#> 5 <NA>
#> 6 <NA>
Flags for primary and secondary units
dtSampling_p$pop %>% count(flags)
#> flags n
#> 1 P 198
#> 2 U 100
#> 3 <NA> 216
Units Sampled
head(dtSampling_p$sampledf)
#> idkab kdprov kdkab nmprov nmkab Laki-laki Perempuan Total flags
#> 1 1102 11 02 ACEH ACEH SINGKIL 63978 62536 126514 P
#> 2 1104 11 04 ACEH ACEH TENGGARA 110799 110061 220860 P
#> 3 1108 11 08 ACEH ACEH BESAR 204428 201107 405535 P
#> 4 1110 11 10 ACEH BIREUEN 215282 221136 436418 P
#> 5 1112 11 12 ACEH ACEH BARAT DAYA 76254 74521 150775 P
#> 6 1116 11 16 ACEH ACEH JAYA 47264 45895 93159 P
#> tanggal
#> 1 28/09/24
#> 2 28/09/24
#> 3 28/09/24
#> 4 28/09/24
#> 5 28/09/24
#> 6 28/09/24
dtSampling_p$sampledf %>% nrow
#> [1] 198
Sampling Details
head(dtSampling_p$details)
#> kdprov jml_kabkota n_primary n_secondary tmp_strata npop ar k
#> 1 11 23 4 8 1 19 0.3700528 2.375000
#> 2 12 33 5 10 1 28 0.4498103 2.800000
#> 3 13 19 3 6 1 16 0.2502315 2.666667
#> 4 14 12 3 6 1 9 0.6223435 1.500000
#> 5 15 11 3 6 1 8 0.7630308 1.333333
#> 6 16 17 3 6 1 14 0.7818968 2.333333
#> n_deficit n_selected
#> 1 0 8
#> 2 0 10
#> 3 0 6
#> 4 0 6
#> 5 0 6
#> 6 0 6
PPS Systematic Sampling
PPS (probability proportional to size) systematic sampling is a method of sampling from a finite population in which a size measure is available for each population unit before sampling, and the probability of selecting a unit is proportional to its size. Units with larger sizes have a higher chance of being selected. We will use the doSampling function with the parameters method = 'pps' and auxVar = 'Total', which sets Total as the auxiliary variable.
# PPS systematic sample of primary units (flagged "U") within each kdprov
# group, using the Total column as the auxiliary size measure.
dtSampling_pps <- doSampling(
  pop = pop_dt,
  alloc = alokasi_dt,
  nsample = "n_primary",
  seed = 321,
  method = "pps",
  auxVar = "Total",
  ident = "kdprov",
  type = "U"
)
#> no implicit stratification variable chosen, sort by: kdprov
#> Negative allocation: 0
#> Zero allocation: 0
#> Joining with `by = join_by(kdprov, tmp_strata)`
#> kdprov 11
#> kdprov 12
#> kdprov 13
#> kdprov 14
#> kdprov 15
#> kdprov 16
#> kdprov 17
#> kdprov 18
#> kdprov 19
#> kdprov 21
#> kdprov 31
#> kdprov 32
#> kdprov 33
#> kdprov 34
#> kdprov 35
#> kdprov 36
#> kdprov 51
#> kdprov 52
#> kdprov 53
#> kdprov 61
#> kdprov 62
#> kdprov 63
#> kdprov 64
#> kdprov 65
#> kdprov 71
#> kdprov 72
#> kdprov 73
#> kdprov 74
#> kdprov 75
#> kdprov 76
#> kdprov 81
#> kdprov 82
#> kdprov 91
#> kdprov 94
#> Joining with `by = join_by(kdprov, tmp_strata)`
#> All allocations have been selected. Selected 100 out of 100 (100%)
Displaying the PPS sampling result
Population Sampled
head(dtSampling_pps$pop)
#> idkab kdprov kdkab nmprov nmkab Laki-laki Perempuan Total flags
#> 1 1101 11 01 ACEH SIMEULUE 47630 45235 92865 <NA>
#> 2 1102 11 02 ACEH ACEH SINGKIL 63978 62536 126514 <NA>
#> 3 1103 11 03 ACEH ACEH SELATAN 116542 115872 232414 <NA>
#> 4 1104 11 04 ACEH ACEH TENGGARA 110799 110061 220860 <NA>
#> 5 1105 11 05 ACEH ACEH TIMUR 212286 210115 422401 <NA>
#> 6 1106 11 06 ACEH ACEH TENGAH 109262 106314 215576 U
#> tanggal
#> 1 <NA>
#> 2 <NA>
#> 3 <NA>
#> 4 <NA>
#> 5 <NA>
#> 6 28/09/24
Units Sampled
head(dtSampling_pps$sampledf)
#> idkab kdprov kdkab nmprov nmkab Laki-laki Perempuan Total
#> 1 1106 11 06 ACEH ACEH TENGAH 109262 106314 215576
#> 2 1110 11 10 ACEH BIREUEN 215282 221136 436418
#> 3 1114 11 14 ACEH ACEH TAMIANG 149263 145093 294356
#> 4 1175 11 75 ACEH SUBULUSSALAM 46065 44686 90751
#> 5 1208 12 08 SUMATERA UTARA ASAHAN 389391 380569 769960
#> 6 1212 12 12 SUMATERA UTARA DELI SERDANG 971735 959706 1931441
#> flags tanggal
#> 1 U 28/09/24
#> 2 U 28/09/24
#> 3 U 28/09/24
#> 4 U 28/09/24
#> 5 U 28/09/24
#> 6 U 28/09/24
dtSampling_pps$sampledf %>% nrow
#> [1] 100
Sampling Details
head(dtSampling_pps$details)
#> kdprov jml_kabkota n_primary tmp_strata npop ar k n_deficit
#> 1 11 23 4 1 23 0.9558938 5.750000 0
#> 2 12 33 5 1 33 0.9372855 6.600000 0
#> 3 13 19 3 1 19 0.2382205 6.333333 0
#> 4 14 12 3 1 12 0.2550736 4.000000 0
#> 5 15 11 3 1 11 0.3905120 3.666667 0
#> 6 16 17 3 1 17 0.3411799 5.666667 0
#> n_selected
#> 1 4
#> 2 5
#> 3 3
#> 4 3
#> 5 3
#> 6 3
Sampling using Stratification
For sampling that utilizes stratification, the doSampling function takes an additional parameter called strata. The strata variable must be available in both the population and the allocation data being used. For example, a stratum variable strata_kabkot is added to the pop_dt data to distinguish districts (strata_kabkot = 1) from cities (strata_kabkot = 2).
# Stratum indicator: regency/city codes (kdkab) starting with "7" are
# cities (2); all other codes are districts (1).
pop_dt_strata <- mutate(
  pop_dt,
  strata_kabkot = ifelse(substr(kdkab, 1, 1) == "7", 2, 1)
)
# Allocation table at province x stratum level: count the regencies/cities
# in each stratum, then attach the province-level allocation as n_alloc.
alokasi_dt_strata <- pop_dt_strata %>%
  group_by(kdprov, strata_kabkot) %>%
  # .groups = "drop" returns an ungrouped result directly, so no separate
  # ungroup() step is needed and no regrouping message is emitted.
  summarise(jml_kabkota = n(), .groups = "drop") %>%
  left_join(
    alokasi_dt %>%
      select(kdprov, n_alloc = n_primary),
    # Explicit key: the join can never silently pick up another shared column.
    by = "kdprov"
  )
# Distribute each province's n_alloc across its strata based on the number
# of regencies/cities (jml_kabkota); the resulting per-stratum sample size
# is the n_primary column used by the doSampling call below.
alokasi_dt_strata <- get_allocation(
  alokasi_dt_strata,
  n_alloc = "n_alloc",
  group = "kdprov",
  pop_var = "jml_kabkota"
)

# Systematic sample of primary units within each kdprov x strata_kabkot cell.
dtSampling_strata <- doSampling(
  pop = pop_dt_strata,
  alloc = alokasi_dt_strata,
  nsample = "n_primary",
  seed = 3512,
  method = "systematic",
  strata = "strata_kabkot",
  ident = "kdprov",
  type = "U"
)
#> no implicit stratification variable chosen, sort by: kdprov and strata_kabkot
#>
#> Negative allocation: 0
#>
#> Zero allocation: 0
#> kdprov 11 strata_kabkot 1
#> kdprov 11 strata_kabkot 2
#> kdprov 12 strata_kabkot 1
#> kdprov 12 strata_kabkot 2
#> kdprov 13 strata_kabkot 1
#> kdprov 13 strata_kabkot 2
#> kdprov 14 strata_kabkot 1
#> kdprov 14 strata_kabkot 2
#> kdprov 15 strata_kabkot 1
#> kdprov 15 strata_kabkot 2
#> kdprov 16 strata_kabkot 1
#> kdprov 16 strata_kabkot 2
#> kdprov 17 strata_kabkot 1
#> kdprov 17 strata_kabkot 2
#> kdprov 18 strata_kabkot 1
#> kdprov 18 strata_kabkot 2
#> kdprov 19 strata_kabkot 1
#> kdprov 19 strata_kabkot 2
#> kdprov 21 strata_kabkot 1
#> kdprov 21 strata_kabkot 2
#> kdprov 31 strata_kabkot 1
#> kdprov 31 strata_kabkot 2
#> kdprov 32 strata_kabkot 1
#> kdprov 32 strata_kabkot 2
#> kdprov 33 strata_kabkot 1
#> kdprov 33 strata_kabkot 2
#> kdprov 34 strata_kabkot 1
#> kdprov 34 strata_kabkot 2
#> kdprov 35 strata_kabkot 1
#> kdprov 35 strata_kabkot 2
#> kdprov 36 strata_kabkot 1
#> kdprov 36 strata_kabkot 2
#> kdprov 51 strata_kabkot 1
#> kdprov 51 strata_kabkot 2
#> kdprov 52 strata_kabkot 1
#> kdprov 52 strata_kabkot 2
#> kdprov 53 strata_kabkot 1
#> kdprov 53 strata_kabkot 2
#> kdprov 61 strata_kabkot 1
#> kdprov 61 strata_kabkot 2
#> kdprov 62 strata_kabkot 1
#> kdprov 62 strata_kabkot 2
#> kdprov 63 strata_kabkot 1
#> kdprov 63 strata_kabkot 2
#> kdprov 64 strata_kabkot 1
#> kdprov 64 strata_kabkot 2
#> kdprov 65 strata_kabkot 1
#> kdprov 65 strata_kabkot 2
#> kdprov 71 strata_kabkot 1
#> kdprov 71 strata_kabkot 2
#> kdprov 72 strata_kabkot 1
#> kdprov 72 strata_kabkot 2
#> kdprov 73 strata_kabkot 1
#> kdprov 73 strata_kabkot 2
#> kdprov 74 strata_kabkot 1
#> kdprov 74 strata_kabkot 2
#> kdprov 75 strata_kabkot 1
#> kdprov 75 strata_kabkot 2
#> kdprov 76 strata_kabkot 1
#> kdprov 81 strata_kabkot 1
#> kdprov 81 strata_kabkot 2
#> kdprov 82 strata_kabkot 1
#> kdprov 82 strata_kabkot 2
#> kdprov 91 strata_kabkot 1
#> kdprov 91 strata_kabkot 2
#> kdprov 94 strata_kabkot 1
#> kdprov 94 strata_kabkot 2
#> Joining with `by = join_by(kdprov, strata_kabkot)`
#> All allocations have been selected. Selected 100 out of 100 (100%)
Displaying the sampling result with stratification
Population Sampled
head(dtSampling_strata$pop)
#> idkab kdprov kdkab nmprov nmkab Laki-laki Perempuan Total
#> 1 1101 11 01 ACEH SIMEULUE 47630 45235 92865
#> 2 1102 11 02 ACEH ACEH SINGKIL 63978 62536 126514
#> 3 1103 11 03 ACEH ACEH SELATAN 116542 115872 232414
#> 4 1104 11 04 ACEH ACEH TENGGARA 110799 110061 220860
#> 5 1105 11 05 ACEH ACEH TIMUR 212286 210115 422401
#> 6 1106 11 06 ACEH ACEH TENGAH 109262 106314 215576
#> strata_kabkot flags tanggal
#> 1 1 <NA> <NA>
#> 2 1 <NA> <NA>
#> 3 1 U 28/09/24
#> 4 1 <NA> <NA>
#> 5 1 <NA> <NA>
#> 6 1 <NA> <NA>
Units Sampled
head(dtSampling_strata$sampledf)
#> idkab kdprov kdkab nmprov nmkab Laki-laki Perempuan Total
#> 1 1103 11 03 ACEH ACEH SELATAN 116542 115872 232414
#> 2 1109 11 09 ACEH PIDIE 215878 219397 435275
#> 3 1115 11 15 ACEH NAGAN RAYA 85039 83353 168392
#> 4 1171 11 71 ACEH BANDA ACEH 127435 125464 252899
#> 5 1204 12 04 SUMATERA UTARA TAPANULI TENGAH 183814 181363 365177
#> 6 1212 12 12 SUMATERA UTARA DELI SERDANG 971735 959706 1931441
#> strata_kabkot flags tanggal
#> 1 1 U 28/09/24
#> 2 1 U 28/09/24
#> 3 1 U 28/09/24
#> 4 2 U 28/09/24
#> 5 1 U 28/09/24
#> 6 1 U 28/09/24
dtSampling_strata$sampledf %>% nrow
#> [1] 100
dtSampling_strata$sampledf %>% count(strata_kabkot)
#> strata_kabkot n
#> 1 1 63
#> 2 2 37
Sampling Details
head(dtSampling_strata$details)
#> kdprov strata_kabkot jml_kabkota n_alloc n_primary npop ar k
#> 1 11 1 18 4 3 18 0.4643545 6.000000
#> 2 11 2 5 4 1 5 0.2667689 5.000000
#> 3 12 1 25 5 3 25 0.4806872 8.333333
#> 4 12 2 8 5 2 8 0.4282740 4.000000
#> 5 13 1 12 3 2 12 0.7073768 6.000000
#> 6 13 2 7 3 1 7 0.1943529 7.000000
#> n_deficit n_selected
#> 1 0 3
#> 2 0 1
#> 3 0 3
#> 4 0 2
#> 5 0 2
#> 6 0 1
Sampling with Implicit Stratification
Sampling sometimes employs implicit stratification so that the characteristics of the selected sample are distributed according to certain variables. For instance, to obtain a sample distributed according to the total population, add the parameter implicitby = 'Total' when conducting the sampling.
# Same stratified systematic design as before, with Total supplied as the
# implicit stratification variable.
dtSampling_implicit <- doSampling(
  pop = pop_dt_strata,
  alloc = alokasi_dt_strata,
  nsample = "n_primary",
  seed = 3512,
  method = "systematic",
  strata = "strata_kabkot",
  implicitby = "Total",
  ident = "kdprov",
  type = "U"
)
#> sort by: kdprov, strata_kabkot and Total
#> Negative allocation: 0
#> Zero allocation: 0
#> kdprov 11 strata_kabkot 1
#> kdprov 11 strata_kabkot 2
#> kdprov 12 strata_kabkot 1
#> kdprov 12 strata_kabkot 2
#> kdprov 13 strata_kabkot 1
#> kdprov 13 strata_kabkot 2
#> kdprov 14 strata_kabkot 1
#> kdprov 14 strata_kabkot 2
#> kdprov 15 strata_kabkot 1
#> kdprov 15 strata_kabkot 2
#> kdprov 16 strata_kabkot 1
#> kdprov 16 strata_kabkot 2
#> kdprov 17 strata_kabkot 1
#> kdprov 17 strata_kabkot 2
#> kdprov 18 strata_kabkot 1
#> kdprov 18 strata_kabkot 2
#> kdprov 19 strata_kabkot 1
#> kdprov 19 strata_kabkot 2
#> kdprov 21 strata_kabkot 1
#> kdprov 21 strata_kabkot 2
#> kdprov 31 strata_kabkot 1
#> kdprov 31 strata_kabkot 2
#> kdprov 32 strata_kabkot 1
#> kdprov 32 strata_kabkot 2
#> kdprov 33 strata_kabkot 1
#> kdprov 33 strata_kabkot 2
#> kdprov 34 strata_kabkot 1
#> kdprov 34 strata_kabkot 2
#> kdprov 35 strata_kabkot 1
#> kdprov 35 strata_kabkot 2
#> kdprov 36 strata_kabkot 1
#> kdprov 36 strata_kabkot 2
#> kdprov 51 strata_kabkot 1
#> kdprov 51 strata_kabkot 2
#> kdprov 52 strata_kabkot 1
#> kdprov 52 strata_kabkot 2
#> kdprov 53 strata_kabkot 1
#> kdprov 53 strata_kabkot 2
#> kdprov 61 strata_kabkot 1
#> kdprov 61 strata_kabkot 2
#> kdprov 62 strata_kabkot 1
#> kdprov 62 strata_kabkot 2
#> kdprov 63 strata_kabkot 1
#> kdprov 63 strata_kabkot 2
#> kdprov 64 strata_kabkot 1
#> kdprov 64 strata_kabkot 2
#> kdprov 65 strata_kabkot 1
#> kdprov 65 strata_kabkot 2
#> kdprov 71 strata_kabkot 1
#> kdprov 71 strata_kabkot 2
#> kdprov 72 strata_kabkot 1
#> kdprov 72 strata_kabkot 2
#> kdprov 73 strata_kabkot 1
#> kdprov 73 strata_kabkot 2
#> kdprov 74 strata_kabkot 1
#> kdprov 74 strata_kabkot 2
#> kdprov 75 strata_kabkot 1
#> kdprov 75 strata_kabkot 2
#> kdprov 76 strata_kabkot 1
#> kdprov 81 strata_kabkot 1
#> kdprov 81 strata_kabkot 2
#> kdprov 82 strata_kabkot 1
#> kdprov 82 strata_kabkot 2
#> kdprov 91 strata_kabkot 1
#> kdprov 91 strata_kabkot 2
#> kdprov 94 strata_kabkot 1
#> kdprov 94 strata_kabkot 2
#> Joining with `by = join_by(kdprov, strata_kabkot)`
#> All allocations have been selected. Selected 100 out of 100 (100%)
Displaying the sampling result with implicit stratification
Population Sampled
head(dtSampling_implicit$pop)
#> idkab kdprov kdkab nmprov nmkab Laki-laki Perempuan Total
#> 1 1101 11 01 ACEH SIMEULUE 47630 45235 92865
#> 2 1116 11 16 ACEH ACEH JAYA 47264 45895 93159
#> 3 1113 11 13 ACEH GAYO LUES 50026 49506 99532
#> 4 1102 11 02 ACEH ACEH SINGKIL 63978 62536 126514
#> 5 1112 11 12 ACEH ACEH BARAT DAYA 76254 74521 150775
#> 6 1118 11 18 ACEH PIDIE JAYA 78742 79655 158397
#> strata_kabkot flags tanggal
#> 1 1 <NA> <NA>
#> 2 1 <NA> <NA>
#> 3 1 U 28/09/24
#> 4 1 <NA> <NA>
#> 5 1 <NA> <NA>
#> 6 1 <NA> <NA>
Units Sampled
head(dtSampling_implicit$sampledf)
#> idkab kdprov kdkab nmprov nmkab Laki-laki Perempuan Total
#> 1 1113 11 13 ACEH GAYO LUES 50026 49506 99532
#> 2 1107 11 07 ACEH ACEH BARAT 100492 98244 198736
#> 3 1105 11 05 ACEH ACEH TIMUR 212286 210115 422401
#> 4 1172 11 72 ACEH SABANG 20838 20359 41197
#> 5 1201 12 01 SUMATERA UTARA NIAS 71686 74986 146672
#> 6 1205 12 05 SUMATERA UTARA TAPANULI UTARA 156176 156582 312758
#> strata_kabkot flags tanggal
#> 1 1 U 28/09/24
#> 2 1 U 28/09/24
#> 3 1 U 28/09/24
#> 4 2 U 28/09/24
#> 5 1 U 28/09/24
#> 6 1 U 28/09/24
dtSampling_implicit$sampledf %>% nrow
#> [1] 100
dtSampling_implicit$sampledf %>% count(strata_kabkot)
#> strata_kabkot n
#> 1 1 63
#> 2 2 37
Sampling Details
head(dtSampling_implicit$details)
#> kdprov strata_kabkot jml_kabkota n_alloc n_primary npop ar k
#> 1 11 1 18 4 3 18 0.4643545 6.000000
#> 2 11 2 5 4 1 5 0.2667689 5.000000
#> 3 12 1 25 5 3 25 0.4806872 8.333333
#> 4 12 2 8 5 2 8 0.4282740 4.000000
#> 5 13 1 12 3 2 12 0.7073768 6.000000
#> 6 13 2 7 3 1 7 0.1943529 7.000000
#> n_deficit n_selected
#> 1 0 3
#> 2 0 1
#> 3 0 3
#> 4 0 2
#> 5 0 2
#> 6 0 1
Sampling with Predetermined Random Number
Sometimes the random numbers for sampling have already been determined beforehand. The samplingin package accommodates this through the parameter predetermined_rn, which takes the name of the variable storing the predetermined random numbers. For example, if the random numbers are stored in the allocation data frame under the variable name arand, we add predetermined_rn = 'arand'.
# Fix the RNG state so the illustrative random numbers are reproducible,
# then attach one uniform(0, 1) draw per allocation row as `arand`.
set.seed(988)
alokasi_dt_arand <- mutate(alokasi_dt_strata, arand = runif(n(), 0, 1))
alokasi_dt_arand %>% as.data.frame %>% head(10)
#> kdprov strata_kabkot jml_kabkota n_alloc n_primary arand
#> 1 11 1 18 4 3 0.769460780
#> 2 11 2 5 4 1 0.341945429
#> 3 12 1 25 5 3 0.424880783
#> 4 12 2 8 5 2 0.635811473
#> 5 13 1 12 3 2 0.552268938
#> 6 13 2 7 3 1 0.394085933
#> 7 14 1 10 3 2 0.875416729
#> 8 14 2 2 3 1 0.006720942
#> 9 15 1 9 3 2 0.402575672
#> 10 15 2 2 3 1 0.155631074
# Stratified systematic sample whose random numbers are taken from the
# `arand` column of the allocation table.
dtSampling_prn <- doSampling(
  pop = pop_dt_strata,
  alloc = alokasi_dt_arand,
  nsample = "n_primary",
  seed = 974,
  method = "systematic",
  strata = "strata_kabkot",
  predetermined_rn = "arand",
  ident = "kdprov",
  type = "U"
)
#> no implicit stratification variable chosen, sort by: kdprov and strata_kabkot
#> Negative allocation: 0
#> Zero allocation: 0
#> Using Predetermined Random Number on Allocation (arand)
#> kdprov 11 strata_kabkot 1
#> kdprov 11 strata_kabkot 2
#> kdprov 12 strata_kabkot 1
#> kdprov 12 strata_kabkot 2
#> kdprov 13 strata_kabkot 1
#> kdprov 13 strata_kabkot 2
#> kdprov 14 strata_kabkot 1
#> kdprov 14 strata_kabkot 2
#> kdprov 15 strata_kabkot 1
#> kdprov 15 strata_kabkot 2
#> kdprov 16 strata_kabkot 1
#> kdprov 16 strata_kabkot 2
#> kdprov 17 strata_kabkot 1
#> kdprov 17 strata_kabkot 2
#> kdprov 18 strata_kabkot 1
#> kdprov 18 strata_kabkot 2
#> kdprov 19 strata_kabkot 1
#> kdprov 19 strata_kabkot 2
#> kdprov 21 strata_kabkot 1
#> kdprov 21 strata_kabkot 2
#> kdprov 31 strata_kabkot 1
#> kdprov 31 strata_kabkot 2
#> kdprov 32 strata_kabkot 1
#> kdprov 32 strata_kabkot 2
#> kdprov 33 strata_kabkot 1
#> kdprov 33 strata_kabkot 2
#> kdprov 34 strata_kabkot 1
#> kdprov 34 strata_kabkot 2
#> kdprov 35 strata_kabkot 1
#> kdprov 35 strata_kabkot 2
#> kdprov 36 strata_kabkot 1
#> kdprov 36 strata_kabkot 2
#> kdprov 51 strata_kabkot 1
#> kdprov 51 strata_kabkot 2
#> kdprov 52 strata_kabkot 1
#> kdprov 52 strata_kabkot 2
#> kdprov 53 strata_kabkot 1
#> kdprov 53 strata_kabkot 2
#> kdprov 61 strata_kabkot 1
#> kdprov 61 strata_kabkot 2
#> kdprov 62 strata_kabkot 1
#> kdprov 62 strata_kabkot 2
#> kdprov 63 strata_kabkot 1
#> kdprov 63 strata_kabkot 2
#> kdprov 64 strata_kabkot 1
#> kdprov 64 strata_kabkot 2
#> kdprov 65 strata_kabkot 1
#> kdprov 65 strata_kabkot 2
#> kdprov 71 strata_kabkot 1
#> kdprov 71 strata_kabkot 2
#> kdprov 72 strata_kabkot 1
#> kdprov 72 strata_kabkot 2
#> kdprov 73 strata_kabkot 1
#> kdprov 73 strata_kabkot 2
#> kdprov 74 strata_kabkot 1
#> kdprov 74 strata_kabkot 2
#> kdprov 75 strata_kabkot 1
#> kdprov 75 strata_kabkot 2
#> kdprov 76 strata_kabkot 1
#> kdprov 81 strata_kabkot 1
#> kdprov 81 strata_kabkot 2
#> kdprov 82 strata_kabkot 1
#> kdprov 82 strata_kabkot 2
#> kdprov 91 strata_kabkot 1
#> kdprov 91 strata_kabkot 2
#> kdprov 94 strata_kabkot 1
#> kdprov 94 strata_kabkot 2
#> Joining with `by = join_by(kdprov, strata_kabkot)`
#> All allocations have been selected. Selected 100 out of 100 (100%)
Displaying the sampling result with predetermined random number
Population Sampled
head(dtSampling_prn$pop)
#> idkab kdprov kdkab nmprov nmkab Laki-laki Perempuan Total
#> 1 1101 11 01 ACEH SIMEULUE 47630 45235 92865
#> 2 1102 11 02 ACEH ACEH SINGKIL 63978 62536 126514
#> 3 1103 11 03 ACEH ACEH SELATAN 116542 115872 232414
#> 4 1104 11 04 ACEH ACEH TENGGARA 110799 110061 220860
#> 5 1105 11 05 ACEH ACEH TIMUR 212286 210115 422401
#> 6 1106 11 06 ACEH ACEH TENGAH 109262 106314 215576
#> strata_kabkot flags tanggal
#> 1 1 <NA> <NA>
#> 2 1 <NA> <NA>
#> 3 1 <NA> <NA>
#> 4 1 <NA> <NA>
#> 5 1 U 28/09/24
#> 6 1 <NA> <NA>
Units Sampled
head(dtSampling_prn$sampledf)
#> idkab kdprov kdkab nmprov nmkab Laki-laki Perempuan Total
#> 1 1105 11 05 ACEH ACEH TIMUR 212286 210115 422401
#> 2 1111 11 11 ACEH ACEH UTARA 301211 301582 602793
#> 3 1117 11 17 ACEH BENER MERIAH 81765 79577 161342
#> 4 1172 11 72 ACEH SABANG 20838 20359 41197
#> 5 1204 12 04 SUMATERA UTARA TAPANULI TENGAH 183814 181363 365177
#> 6 1212 12 12 SUMATERA UTARA DELI SERDANG 971735 959706 1931441
#> strata_kabkot flags tanggal
#> 1 1 U 28/09/24
#> 2 1 U 28/09/24
#> 3 1 U 28/09/24
#> 4 2 U 28/09/24
#> 5 1 U 28/09/24
#> 6 1 U 28/09/24
dtSampling_prn$sampledf %>% nrow
#> [1] 100
Sampling Details
head(dtSampling_prn$details)
#> kdprov strata_kabkot jml_kabkota n_alloc n_primary arand npop ar
#> 1 11 1 18 4 3 0.7694608 18 0.7694608
#> 2 11 2 5 4 1 0.3419454 5 0.3419454
#> 3 12 1 25 5 3 0.4248808 25 0.4248808
#> 4 12 2 8 5 2 0.6358115 8 0.6358115
#> 5 13 1 12 3 2 0.5522689 12 0.5522689
#> 6 13 2 7 3 1 0.3940859 7 0.3940859
#> k n_deficit n_selected
#> 1 6.000000 0 3
#> 2 5.000000 0 1
#> 3 8.333333 0 3
#> 4 4.000000 0 2
#> 5 6.000000 0 2
#> 6 7.000000 0 1
Allocate predetermined allocations to smaller levels
One of the supporting functions in the samplingin package is get_allocation. This function distributes a sample allocation to lower levels using proportional allocation based on the square root of the specified variable.
For example, a sample allocation available at the province level can be distributed to lower levels, such as districts/cities, in proportion to the square root of the total population (Total).
# Illustrative province-level allocation: keep only the province code and
# attach a reproducible random integer between 100 and 199 as init_alloc.
set.seed(242)
alokasi_prov <- alokasi_dt %>%
  select(-c(jml_kabkota, n_primary)) %>%
  mutate(init_alloc = as.integer(runif(n(), 100, 200))) %>%
  as.data.frame()
alokasi_prov %>% head(10)
#> kdprov init_alloc
#> 1 11 178
#> 2 12 100
#> 3 13 133
#> 4 14 168
#> 5 15 165
#> 6 16 176
#> 7 17 175
#> 8 18 192
#> 9 19 102
#> 10 21 164
alokasi_prov %>%
summarise(sum(init_alloc))
#> sum(init_alloc)
#> 1 5168
# District/city-level allocation: attach each province's init_alloc to its
# regencies/cities, then distribute it within the province based on Total.
alokasi_kab <- pop_dt %>%
  # Explicit key: the join can never silently pick up another shared column.
  left_join(alokasi_prov, by = "kdprov") %>%
  get_allocation(
    n_alloc = "init_alloc",
    group = "kdprov",
    pop_var = "Total"
  ) %>%
  as.data.frame()
alokasi_kab %>% head(10)
#> idkab kdprov kdkab nmprov nmkab Laki-laki Perempuan Total
#> 1 1101 11 01 ACEH SIMEULUE 47630 45235 92865
#> 2 1102 11 02 ACEH ACEH SINGKIL 63978 62536 126514
#> 3 1103 11 03 ACEH ACEH SELATAN 116542 115872 232414
#> 4 1104 11 04 ACEH ACEH TENGGARA 110799 110061 220860
#> 5 1105 11 05 ACEH ACEH TIMUR 212286 210115 422401
#> 6 1106 11 06 ACEH ACEH TENGAH 109262 106314 215576
#> 7 1107 11 07 ACEH ACEH BARAT 100492 98244 198736
#> 8 1108 11 08 ACEH ACEH BESAR 204428 201107 405535
#> 9 1109 11 09 ACEH PIDIE 215878 219397 435275
#> 10 1110 11 10 ACEH BIREUEN 215282 221136 436418
#> init_alloc n_primary
#> 1 178 5
#> 2 178 6
#> 3 178 8
#> 4 178 8
#> 5 178 11
#> 6 178 8
#> 7 178 8
#> 8 178 11
#> 9 178 11
#> 10 178 11
alokasi_kab %>% summarise(sum(n_primary))
#> sum(n_primary)
#> 1 5168
alokasi_kab %>%
group_by(kdprov) %>%
summarise(sum(n_primary))
#> # A tibble: 34 x 2
#> kdprov `sum(n_primary)`
#> <chr> <dbl>
#> 1 11 178
#> 2 12 100
#> 3 13 133
#> 4 14 168
#> # ... with 30 more rows
# Check: summing the district/city allocations within each province must
# reproduce the original province-level allocation.
all.equal(
  alokasi_prov,
  alokasi_kab %>%
    group_by(kdprov) %>%
    summarise(init_alloc = sum(n_primary)) %>%
    ungroup() %>%
    as.data.frame()
)
#> [1] TRUE