Introduction to samplingin

Population DataFrame

We’ll use the dataset pop_dt. It contains a tabulation of Indonesia’s population by regency/city and gender, based on the results of the 2020 population census conducted by BPS-Statistics Indonesia (https://sensus.bps.go.id/main/index/sp2020).

dim(pop_dt)
#> [1] 514   8

pop_dt %>% head()
#>   idkab kdprov kdkab nmprov         nmkab Laki-laki Perempuan  Total
#> 1  1101     11    01   ACEH      SIMEULUE     47630     45235  92865
#> 2  1102     11    02   ACEH  ACEH SINGKIL     63978     62536 126514
#> 3  1103     11    03   ACEH  ACEH SELATAN    116542    115872 232414
#> 4  1104     11    04   ACEH ACEH TENGGARA    110799    110061 220860
#> 5  1105     11    05   ACEH    ACEH TIMUR    212286    210115 422401
#> 6  1106     11    06   ACEH   ACEH TENGAH    109262    106314 215576

Allocation DataFrame

The allocation dataset is alokasi_dt, which contains the sample allocation for each province.

dim(alokasi_dt)
#> [1] 34  3

alokasi_dt
#> # A tibble: 34 x 3
#>   kdprov jml_kabkota n_primary
#>   <chr>        <int>     <dbl>
#> 1 11              23         4
#> 2 12              33         5
#> 3 13              19         3
#> 4 14              12         3
#> # ... with 30 more rows

Simple Random Sampling (SRS)

A simple random sample is a randomly selected subset of a population. In this sampling method, each member of the population has an exactly equal chance of being selected.

The following is the syntax for simple random sampling. Use the parameter method = 'srs'.

dtSampling_srs = doSampling(
  pop     = pop_dt,
  alloc   = alokasi_dt,
  nsample = "n_primary",
  seed    = 7891,
  method  = "srs",
  ident   = c("kdprov"),
  type    = "U"
)
#> no implicit stratification variable chosen, sort by: kdprov
#> Negative allocation: 0
#> Zero allocation: 0
#>   kdprov 11 
#>   kdprov 12 
#>   kdprov 13 
#>   kdprov 14 
#>   kdprov 15 
#>   kdprov 16 
#>   kdprov 17 
#>   kdprov 18 
#>   kdprov 19 
#>   kdprov 21 
#>   kdprov 31 
#>   kdprov 32 
#>   kdprov 33 
#>   kdprov 34 
#>   kdprov 35 
#>   kdprov 36 
#>   kdprov 51 
#>   kdprov 52 
#>   kdprov 53 
#>   kdprov 61 
#>   kdprov 62 
#>   kdprov 63 
#>   kdprov 64 
#>   kdprov 65 
#>   kdprov 71 
#>   kdprov 72 
#>   kdprov 73 
#>   kdprov 74 
#>   kdprov 75 
#>   kdprov 76 
#>   kdprov 81 
#>   kdprov 82 
#>   kdprov 91 
#>   kdprov 94
#> Joining with `by = join_by(kdprov, tmp_strata)`
#> All allocations have been selected. Selected 100 out of 100 (100%)

Displaying the primary sampling result

Population Sampled

head(dtSampling_srs$pop)
#>   idkab kdprov kdkab nmprov         nmkab Laki-laki Perempuan  Total flags
#> 1  1101     11    01   ACEH      SIMEULUE     47630     45235  92865     U
#> 2  1102     11    02   ACEH  ACEH SINGKIL     63978     62536 126514  <NA>
#> 3  1103     11    03   ACEH  ACEH SELATAN    116542    115872 232414  <NA>
#> 4  1104     11    04   ACEH ACEH TENGGARA    110799    110061 220860  <NA>
#> 5  1105     11    05   ACEH    ACEH TIMUR    212286    210115 422401  <NA>
#> 6  1106     11    06   ACEH   ACEH TENGAH    109262    106314 215576  <NA>
#>    tanggal
#> 1 28/09/24
#> 2     <NA>
#> 3     <NA>
#> 4     <NA>
#> 5     <NA>
#> 6     <NA>

Units Sampled

head(dtSampling_srs$sampledf)
#>   idkab kdprov kdkab         nmprov      nmkab Laki-laki Perempuan  Total flags
#> 1  1101     11    01           ACEH   SIMEULUE     47630     45235  92865     U
#> 2  1118     11    18           ACEH PIDIE JAYA     78742     79655 158397     U
#> 3  1171     11    71           ACEH BANDA ACEH    127435    125464 252899     U
#> 4  1172     11    72           ACEH     SABANG     20838     20359  41197     U
#> 5  1201     12    01 SUMATERA UTARA       NIAS     71686     74986 146672     U
#> 6  1224     12    24 SUMATERA UTARA NIAS UTARA     73216     74058 147274     U
#>    tanggal
#> 1 28/09/24
#> 2 28/09/24
#> 3 28/09/24
#> 4 28/09/24
#> 5 28/09/24
#> 6 28/09/24

dtSampling_srs$sampledf %>% nrow
#> [1] 100

Sampling Details

head(dtSampling_srs$details)
#>   kdprov jml_kabkota n_primary tmp_strata npop n_deficit n_selected
#> 1     11          23         4          1   23         0          4
#> 2     12          33         5          1   33         0          5
#> 3     13          19         3          1   19         0          3
#> 4     14          12         3          1   12         0          3
#> 5     15          11         3          1   11         0          3
#> 6     16          17         3          1   17         0          3

Systematic Random Sampling

Systematic random sampling is a method of selecting samples at a fixed, preset interval. Using the population and allocation data introduced above, we will carry out systematic random sampling with the doSampling function from the samplingin package. Use the parameter method = 'systematic'.

Primary Units Sampling

The following is the syntax for sampling the primary units

dtSampling_u = doSampling(
  pop     = pop_dt,
  alloc   = alokasi_dt,
  nsample = "n_primary",
  seed    = 2,
  method  = "systematic",
  ident   = c("kdprov"),
  type    = "U"
)
#> no implicit stratification variable chosen, sort by: kdprov
#> Negative allocation: 0
#> Zero allocation: 0
#>   kdprov 11 
#>   kdprov 12 
#>   kdprov 13 
#>   kdprov 14 
#>   kdprov 15 
#>   kdprov 16 
#>   kdprov 17 
#>   kdprov 18 
#>   kdprov 19 
#>   kdprov 21 
#>   kdprov 31 
#>   kdprov 32 
#>   kdprov 33 
#>   kdprov 34 
#>   kdprov 35 
#>   kdprov 36 
#>   kdprov 51 
#>   kdprov 52 
#>   kdprov 53 
#>   kdprov 61 
#>   kdprov 62 
#>   kdprov 63 
#>   kdprov 64 
#>   kdprov 65 
#>   kdprov 71 
#>   kdprov 72 
#>   kdprov 73 
#>   kdprov 74 
#>   kdprov 75 
#>   kdprov 76 
#>   kdprov 81 
#>   kdprov 82 
#>   kdprov 91 
#>   kdprov 94
#> Joining with `by = join_by(kdprov, tmp_strata)`
#> All allocations have been selected. Selected 100 out of 100 (100%)

Displaying the primary sampling result

Population Sampled

head(dtSampling_u$pop)
#>   idkab kdprov kdkab nmprov         nmkab Laki-laki Perempuan  Total flags
#> 1  1101     11    01   ACEH      SIMEULUE     47630     45235  92865     U
#> 2  1102     11    02   ACEH  ACEH SINGKIL     63978     62536 126514  <NA>
#> 3  1103     11    03   ACEH  ACEH SELATAN    116542    115872 232414  <NA>
#> 4  1104     11    04   ACEH ACEH TENGGARA    110799    110061 220860  <NA>
#> 5  1105     11    05   ACEH    ACEH TIMUR    212286    210115 422401  <NA>
#> 6  1106     11    06   ACEH   ACEH TENGAH    109262    106314 215576  <NA>
#>    tanggal
#> 1 28/09/24
#> 2     <NA>
#> 3     <NA>
#> 4     <NA>
#> 5     <NA>
#> 6     <NA>

Units Sampled

head(dtSampling_u$sampledf)
#>   idkab kdprov kdkab         nmprov          nmkab Laki-laki Perempuan  Total
#> 1  1101     11    01           ACEH       SIMEULUE     47630     45235  92865
#> 2  1107     11    07           ACEH     ACEH BARAT    100492     98244 198736
#> 3  1113     11    13           ACEH      GAYO LUES     50026     49506  99532
#> 4  1118     11    18           ACEH     PIDIE JAYA     78742     79655 158397
#> 5  1205     12    05 SUMATERA UTARA TAPANULI UTARA    156176    156582 312758
#> 6  1211     12    11 SUMATERA UTARA           KARO    200247    204751 404998
#>   flags  tanggal
#> 1     U 28/09/24
#> 2     U 28/09/24
#> 3     U 28/09/24
#> 4     U 28/09/24
#> 5     U 28/09/24
#> 6     U 28/09/24

dtSampling_u$sampledf %>% nrow
#> [1] 100

Sampling Details

head(dtSampling_u$details)
#>   kdprov jml_kabkota n_primary tmp_strata npop        ar        k n_deficit
#> 1     11          23         4          1   23 0.1848823 5.750000         0
#> 2     12          33         5          1   33 0.7023740 6.600000         0
#> 3     13          19         3          1   19 0.5733263 6.333333         0
#> 4     14          12         3          1   12 0.1680519 4.000000         0
#> 5     15          11         3          1   11 0.9438393 3.666667         0
#> 6     16          17         3          1   17 0.9434750 5.666667         0
#>   n_selected
#> 1          4
#> 2          5
#> 3          3
#> 4          3
#> 5          3
#> 6          3

Secondary Units Sampling

To sample secondary units, we use the population returned by the previous sampling step, in which the selected primary units have already been flagged. Add the argument is_secondary = TRUE to the doSampling call.

alokasi_dt_p = alokasi_dt %>% 
  mutate(n_secondary = 2*n_primary)

dtSampling_p = doSampling(
  pop     = dtSampling_u$pop,
  alloc   = alokasi_dt_p,
  nsample = "n_secondary",
  seed    = 243,
  method  = "systematic",
  ident   = c("kdprov"),
  type    = "P",
  is_secondary = TRUE
)
#> no implicit stratification variable chosen, sort by: kdprov
#> Negative allocation: 0
#> Zero allocation: 0
#> Sampling Secondary Units
#>   kdprov 11 
#>   kdprov 12 
#>   kdprov 13 
#>   kdprov 14 
#>   kdprov 15 
#>   kdprov 16 
#>   kdprov 17 
#>   kdprov 18 
#>   kdprov 19 
#>   kdprov 21 
#>   kdprov 31 
#>   kdprov 32 
#>   kdprov 33 
#>   kdprov 34 
#>   kdprov 35 
#>   kdprov 36 
#>   kdprov 51 
#>   kdprov 52 
#>   kdprov 53 
#>   kdprov 61 
#>   kdprov 62 
#>   kdprov 63 
#>   kdprov 64 
#>   kdprov 65 
#>   kdprov 71 
#>   kdprov 72 
#>   kdprov 73 
#>   kdprov 74 
#>   kdprov 75 
#>   kdprov 76 
#>   kdprov 81 
#>   kdprov 82 
#>   kdprov 91 
#>   kdprov 94
#> Joining with `by = join_by(kdprov, tmp_strata, is_secondary_tmp)`
#> WARNING: There are still 2 allocations for which samples have not been selected. Selected 198 out of 200 (99%)

The warning shows that 2 of the allocated units could not be selected. To see which allocations were not fully met, filter the sampling details for rows with n_deficit > 0:

dtSampling_p$details %>% 
  filter(n_deficit>0)
#>   kdprov jml_kabkota n_primary n_secondary tmp_strata npop         ar    k
#> 1     34           5         2           4          1    3 0.05382431 0.75
#> 2     65           5         2           4          1    3 0.42914161 0.75
#>   n_deficit n_selected
#> 1         1          3
#> 2         1          3

Displaying the secondary sampling result

Population Sampled

head(dtSampling_p$pop)
#>   idkab kdprov kdkab nmprov         nmkab Laki-laki Perempuan  Total flags
#> 1  1101     11    01   ACEH      SIMEULUE     47630     45235  92865     U
#> 2  1102     11    02   ACEH  ACEH SINGKIL     63978     62536 126514     P
#> 3  1103     11    03   ACEH  ACEH SELATAN    116542    115872 232414  <NA>
#> 4  1104     11    04   ACEH ACEH TENGGARA    110799    110061 220860     P
#> 5  1105     11    05   ACEH    ACEH TIMUR    212286    210115 422401  <NA>
#> 6  1106     11    06   ACEH   ACEH TENGAH    109262    106314 215576  <NA>
#>    tanggal
#> 1 28/09/24
#> 2 28/09/24
#> 3     <NA>
#> 4 28/09/24
#> 5     <NA>
#> 6     <NA>

Flags for primary and secondary units

dtSampling_p$pop %>% count(flags)
#>   flags   n
#> 1     P 198
#> 2     U 100
#> 3  <NA> 216

Units Sampled

head(dtSampling_p$sampledf)
#>   idkab kdprov kdkab nmprov           nmkab Laki-laki Perempuan  Total flags
#> 1  1102     11    02   ACEH    ACEH SINGKIL     63978     62536 126514     P
#> 2  1104     11    04   ACEH   ACEH TENGGARA    110799    110061 220860     P
#> 3  1108     11    08   ACEH      ACEH BESAR    204428    201107 405535     P
#> 4  1110     11    10   ACEH         BIREUEN    215282    221136 436418     P
#> 5  1112     11    12   ACEH ACEH BARAT DAYA     76254     74521 150775     P
#> 6  1116     11    16   ACEH       ACEH JAYA     47264     45895  93159     P
#>    tanggal
#> 1 28/09/24
#> 2 28/09/24
#> 3 28/09/24
#> 4 28/09/24
#> 5 28/09/24
#> 6 28/09/24

dtSampling_p$sampledf %>% nrow
#> [1] 198

Sampling Details

head(dtSampling_p$details)
#>   kdprov jml_kabkota n_primary n_secondary tmp_strata npop        ar        k
#> 1     11          23         4           8          1   19 0.3700528 2.375000
#> 2     12          33         5          10          1   28 0.4498103 2.800000
#> 3     13          19         3           6          1   16 0.2502315 2.666667
#> 4     14          12         3           6          1    9 0.6223435 1.500000
#> 5     15          11         3           6          1    8 0.7630308 1.333333
#> 6     16          17         3           6          1   14 0.7818968 2.333333
#>   n_deficit n_selected
#> 1         0          8
#> 2         0         10
#> 3         0          6
#> 4         0          6
#> 5         0          6
#> 6         0          6

PPS Systematic Sampling

PPS (probability proportional to size) systematic sampling is a method of sampling from a finite population in which a size measure is available for each population unit before sampling, and in which the probability of selecting a unit is proportional to its size. Larger units therefore have a higher chance of being selected. We will use the doSampling function with the parameters method = 'pps' and auxVar = 'Total', so that Total serves as the auxiliary (size) variable.

dtSampling_pps = doSampling(
  pop     = pop_dt,
  alloc   = alokasi_dt,
  nsample = "n_primary",
  seed    = 321,
  method  = "pps",
  auxVar  = "Total",
  ident   = c("kdprov"),
  type    = "U"
)
#> no implicit stratification variable chosen, sort by: kdprov
#> Negative allocation: 0
#> Zero allocation: 0
#> Joining with `by = join_by(kdprov, tmp_strata)`
#>   kdprov 11 
#>   kdprov 12 
#>   kdprov 13 
#>   kdprov 14 
#>   kdprov 15 
#>   kdprov 16 
#>   kdprov 17 
#>   kdprov 18 
#>   kdprov 19 
#>   kdprov 21 
#>   kdprov 31 
#>   kdprov 32 
#>   kdprov 33 
#>   kdprov 34 
#>   kdprov 35 
#>   kdprov 36 
#>   kdprov 51 
#>   kdprov 52 
#>   kdprov 53 
#>   kdprov 61 
#>   kdprov 62 
#>   kdprov 63 
#>   kdprov 64 
#>   kdprov 65 
#>   kdprov 71 
#>   kdprov 72 
#>   kdprov 73 
#>   kdprov 74 
#>   kdprov 75 
#>   kdprov 76 
#>   kdprov 81 
#>   kdprov 82 
#>   kdprov 91 
#>   kdprov 94
#> Joining with `by = join_by(kdprov, tmp_strata)`
#> All allocations have been selected. Selected 100 out of 100 (100%)

Displaying the PPS sampling result

Population Sampled

head(dtSampling_pps$pop)
#>   idkab kdprov kdkab nmprov         nmkab Laki-laki Perempuan  Total flags
#> 1  1101     11    01   ACEH      SIMEULUE     47630     45235  92865  <NA>
#> 2  1102     11    02   ACEH  ACEH SINGKIL     63978     62536 126514  <NA>
#> 3  1103     11    03   ACEH  ACEH SELATAN    116542    115872 232414  <NA>
#> 4  1104     11    04   ACEH ACEH TENGGARA    110799    110061 220860  <NA>
#> 5  1105     11    05   ACEH    ACEH TIMUR    212286    210115 422401  <NA>
#> 6  1106     11    06   ACEH   ACEH TENGAH    109262    106314 215576     U
#>    tanggal
#> 1     <NA>
#> 2     <NA>
#> 3     <NA>
#> 4     <NA>
#> 5     <NA>
#> 6 28/09/24

Units Sampled

head(dtSampling_pps$sampledf)
#>   idkab kdprov kdkab         nmprov        nmkab Laki-laki Perempuan   Total
#> 1  1106     11    06           ACEH  ACEH TENGAH    109262    106314  215576
#> 2  1110     11    10           ACEH      BIREUEN    215282    221136  436418
#> 3  1114     11    14           ACEH ACEH TAMIANG    149263    145093  294356
#> 4  1175     11    75           ACEH SUBULUSSALAM     46065     44686   90751
#> 5  1208     12    08 SUMATERA UTARA       ASAHAN    389391    380569  769960
#> 6  1212     12    12 SUMATERA UTARA DELI SERDANG    971735    959706 1931441
#>   flags  tanggal
#> 1     U 28/09/24
#> 2     U 28/09/24
#> 3     U 28/09/24
#> 4     U 28/09/24
#> 5     U 28/09/24
#> 6     U 28/09/24

dtSampling_pps$sampledf %>% nrow
#> [1] 100

Sampling Details

head(dtSampling_pps$details)
#>   kdprov jml_kabkota n_primary tmp_strata npop        ar        k n_deficit
#> 1     11          23         4          1   23 0.9558938 5.750000         0
#> 2     12          33         5          1   33 0.9372855 6.600000         0
#> 3     13          19         3          1   19 0.2382205 6.333333         0
#> 4     14          12         3          1   12 0.2550736 4.000000         0
#> 5     15          11         3          1   11 0.3905120 3.666667         0
#> 6     16          17         3          1   17 0.3411799 5.666667         0
#>   n_selected
#> 1          4
#> 2          5
#> 3          3
#> 4          3
#> 5          3
#> 6          3

Sampling using Stratification

For stratified sampling, the doSampling function takes an additional parameter called strata. The strata variable must be present in both the population and the allocation data. In this example, a stratum variable named strata_kabkot is added to the pop_dt data; it distinguishes regencies (strata_kabkot = 1) from cities (strata_kabkot = 2), where a unit is treated as a city when its kdkab code begins with '7'.

pop_dt_strata = pop_dt %>% 
  mutate(
    strata_kabkot = ifelse(substr(kdkab,1,1)=='7', 2, 1)
  )

alokasi_dt_strata = pop_dt_strata %>% 
  group_by(kdprov,strata_kabkot) %>% 
  summarise(
    jml_kabkota = n()
  ) %>% 
  ungroup %>% 
  left_join(
    alokasi_dt %>% 
      select(kdprov,n_primary) %>% 
      rename(n_alloc = n_primary)
  )
#> `summarise()` has grouped output by 'kdprov'. You can override using the
#> `.groups` argument.
#> Joining with `by = join_by(kdprov)`
  
alokasi_dt_strata = alokasi_dt_strata %>%
  get_allocation(n_alloc = "n_alloc", group = c("kdprov"), pop_var = "jml_kabkota")

dtSampling_strata = doSampling(
  pop     = pop_dt_strata,
  alloc   = alokasi_dt_strata,
  nsample = "n_primary",
  seed    = 3512,
  method  = "systematic",
  strata  = "strata_kabkot",
  ident   = c("kdprov"),
  type    = "U"
)
#> no implicit stratification variable chosen, sort by: kdprov and strata_kabkot
#> 
#> Negative allocation: 0
#> 
#> Zero allocation: 0
#>   kdprov 11  strata_kabkot 1 
#>   kdprov 11  strata_kabkot 2 
#>   kdprov 12  strata_kabkot 1 
#>   kdprov 12  strata_kabkot 2 
#>   kdprov 13  strata_kabkot 1 
#>   kdprov 13  strata_kabkot 2 
#>   kdprov 14  strata_kabkot 1 
#>   kdprov 14  strata_kabkot 2 
#>   kdprov 15  strata_kabkot 1 
#>   kdprov 15  strata_kabkot 2 
#>   kdprov 16  strata_kabkot 1 
#>   kdprov 16  strata_kabkot 2 
#>   kdprov 17  strata_kabkot 1 
#>   kdprov 17  strata_kabkot 2 
#>   kdprov 18  strata_kabkot 1 
#>   kdprov 18  strata_kabkot 2 
#>   kdprov 19  strata_kabkot 1 
#>   kdprov 19  strata_kabkot 2 
#>   kdprov 21  strata_kabkot 1 
#>   kdprov 21  strata_kabkot 2 
#>   kdprov 31  strata_kabkot 1 
#>   kdprov 31  strata_kabkot 2 
#>   kdprov 32  strata_kabkot 1 
#>   kdprov 32  strata_kabkot 2 
#>   kdprov 33  strata_kabkot 1 
#>   kdprov 33  strata_kabkot 2 
#>   kdprov 34  strata_kabkot 1 
#>   kdprov 34  strata_kabkot 2 
#>   kdprov 35  strata_kabkot 1 
#>   kdprov 35  strata_kabkot 2 
#>   kdprov 36  strata_kabkot 1 
#>   kdprov 36  strata_kabkot 2 
#>   kdprov 51  strata_kabkot 1 
#>   kdprov 51  strata_kabkot 2 
#>   kdprov 52  strata_kabkot 1 
#>   kdprov 52  strata_kabkot 2 
#>   kdprov 53  strata_kabkot 1 
#>   kdprov 53  strata_kabkot 2 
#>   kdprov 61  strata_kabkot 1 
#>   kdprov 61  strata_kabkot 2 
#>   kdprov 62  strata_kabkot 1 
#>   kdprov 62  strata_kabkot 2 
#>   kdprov 63  strata_kabkot 1 
#>   kdprov 63  strata_kabkot 2 
#>   kdprov 64  strata_kabkot 1 
#>   kdprov 64  strata_kabkot 2 
#>   kdprov 65  strata_kabkot 1 
#>   kdprov 65  strata_kabkot 2 
#>   kdprov 71  strata_kabkot 1 
#>   kdprov 71  strata_kabkot 2 
#>   kdprov 72  strata_kabkot 1 
#>   kdprov 72  strata_kabkot 2 
#>   kdprov 73  strata_kabkot 1 
#>   kdprov 73  strata_kabkot 2 
#>   kdprov 74  strata_kabkot 1 
#>   kdprov 74  strata_kabkot 2 
#>   kdprov 75  strata_kabkot 1 
#>   kdprov 75  strata_kabkot 2 
#>   kdprov 76  strata_kabkot 1 
#>   kdprov 81  strata_kabkot 1 
#>   kdprov 81  strata_kabkot 2 
#>   kdprov 82  strata_kabkot 1 
#>   kdprov 82  strata_kabkot 2 
#>   kdprov 91  strata_kabkot 1 
#>   kdprov 91  strata_kabkot 2 
#>   kdprov 94  strata_kabkot 1 
#>   kdprov 94  strata_kabkot 2
#> Joining with `by = join_by(kdprov, strata_kabkot)`
#> All allocations have been selected. Selected 100 out of 100 (100%)

Displaying the sampling result with stratification

Population Sampled

head(dtSampling_strata$pop)
#>   idkab kdprov kdkab nmprov         nmkab Laki-laki Perempuan  Total
#> 1  1101     11    01   ACEH      SIMEULUE     47630     45235  92865
#> 2  1102     11    02   ACEH  ACEH SINGKIL     63978     62536 126514
#> 3  1103     11    03   ACEH  ACEH SELATAN    116542    115872 232414
#> 4  1104     11    04   ACEH ACEH TENGGARA    110799    110061 220860
#> 5  1105     11    05   ACEH    ACEH TIMUR    212286    210115 422401
#> 6  1106     11    06   ACEH   ACEH TENGAH    109262    106314 215576
#>   strata_kabkot flags  tanggal
#> 1             1  <NA>     <NA>
#> 2             1  <NA>     <NA>
#> 3             1     U 28/09/24
#> 4             1  <NA>     <NA>
#> 5             1  <NA>     <NA>
#> 6             1  <NA>     <NA>

Units Sampled

head(dtSampling_strata$sampledf)
#>   idkab kdprov kdkab         nmprov           nmkab Laki-laki Perempuan   Total
#> 1  1103     11    03           ACEH    ACEH SELATAN    116542    115872  232414
#> 2  1109     11    09           ACEH           PIDIE    215878    219397  435275
#> 3  1115     11    15           ACEH      NAGAN RAYA     85039     83353  168392
#> 4  1171     11    71           ACEH      BANDA ACEH    127435    125464  252899
#> 5  1204     12    04 SUMATERA UTARA TAPANULI TENGAH    183814    181363  365177
#> 6  1212     12    12 SUMATERA UTARA    DELI SERDANG    971735    959706 1931441
#>   strata_kabkot flags  tanggal
#> 1             1     U 28/09/24
#> 2             1     U 28/09/24
#> 3             1     U 28/09/24
#> 4             2     U 28/09/24
#> 5             1     U 28/09/24
#> 6             1     U 28/09/24

dtSampling_strata$sampledf %>% nrow
#> [1] 100

dtSampling_strata$sampledf %>% count(strata_kabkot)
#>   strata_kabkot  n
#> 1             1 63
#> 2             2 37

Sampling Details

head(dtSampling_strata$details)
#>   kdprov strata_kabkot jml_kabkota n_alloc n_primary npop        ar        k
#> 1     11             1          18       4         3   18 0.4643545 6.000000
#> 2     11             2           5       4         1    5 0.2667689 5.000000
#> 3     12             1          25       5         3   25 0.4806872 8.333333
#> 4     12             2           8       5         2    8 0.4282740 4.000000
#> 5     13             1          12       3         2   12 0.7073768 6.000000
#> 6     13             2           7       3         1    7 0.1943529 7.000000
#>   n_deficit n_selected
#> 1         0          3
#> 2         0          1
#> 3         0          3
#> 4         0          2
#> 5         0          2
#> 6         0          1

Sampling with Implicit Stratification

Implicit stratification is sometimes used so that the selected sample is spread across the range of certain variables: the population is sorted by those variables before the sample is drawn. For instance, to obtain a sample that is spread according to total population, add the parameter implicitby = 'Total' when sampling.

dtSampling_implicit = doSampling(
  pop        = pop_dt_strata,
  alloc      = alokasi_dt_strata,
  nsample    = "n_primary",
  seed       = 3512,
  method     = "systematic",
  strata     = "strata_kabkot",
  implicitby = "Total",
  ident      = c("kdprov"),
  type       = "U"
)
#> sort by: kdprov, strata_kabkot and Total
#> Negative allocation: 0
#> Zero allocation: 0
#>   kdprov 11  strata_kabkot 1 
#>   kdprov 11  strata_kabkot 2 
#>   kdprov 12  strata_kabkot 1 
#>   kdprov 12  strata_kabkot 2 
#>   kdprov 13  strata_kabkot 1 
#>   kdprov 13  strata_kabkot 2 
#>   kdprov 14  strata_kabkot 1 
#>   kdprov 14  strata_kabkot 2 
#>   kdprov 15  strata_kabkot 1 
#>   kdprov 15  strata_kabkot 2 
#>   kdprov 16  strata_kabkot 1 
#>   kdprov 16  strata_kabkot 2 
#>   kdprov 17  strata_kabkot 1 
#>   kdprov 17  strata_kabkot 2 
#>   kdprov 18  strata_kabkot 1 
#>   kdprov 18  strata_kabkot 2 
#>   kdprov 19  strata_kabkot 1 
#>   kdprov 19  strata_kabkot 2 
#>   kdprov 21  strata_kabkot 1 
#>   kdprov 21  strata_kabkot 2 
#>   kdprov 31  strata_kabkot 1 
#>   kdprov 31  strata_kabkot 2 
#>   kdprov 32  strata_kabkot 1 
#>   kdprov 32  strata_kabkot 2 
#>   kdprov 33  strata_kabkot 1 
#>   kdprov 33  strata_kabkot 2 
#>   kdprov 34  strata_kabkot 1 
#>   kdprov 34  strata_kabkot 2 
#>   kdprov 35  strata_kabkot 1 
#>   kdprov 35  strata_kabkot 2 
#>   kdprov 36  strata_kabkot 1 
#>   kdprov 36  strata_kabkot 2 
#>   kdprov 51  strata_kabkot 1 
#>   kdprov 51  strata_kabkot 2 
#>   kdprov 52  strata_kabkot 1 
#>   kdprov 52  strata_kabkot 2 
#>   kdprov 53  strata_kabkot 1 
#>   kdprov 53  strata_kabkot 2 
#>   kdprov 61  strata_kabkot 1 
#>   kdprov 61  strata_kabkot 2 
#>   kdprov 62  strata_kabkot 1 
#>   kdprov 62  strata_kabkot 2 
#>   kdprov 63  strata_kabkot 1 
#>   kdprov 63  strata_kabkot 2 
#>   kdprov 64  strata_kabkot 1 
#>   kdprov 64  strata_kabkot 2 
#>   kdprov 65  strata_kabkot 1 
#>   kdprov 65  strata_kabkot 2 
#>   kdprov 71  strata_kabkot 1 
#>   kdprov 71  strata_kabkot 2 
#>   kdprov 72  strata_kabkot 1 
#>   kdprov 72  strata_kabkot 2 
#>   kdprov 73  strata_kabkot 1 
#>   kdprov 73  strata_kabkot 2 
#>   kdprov 74  strata_kabkot 1 
#>   kdprov 74  strata_kabkot 2 
#>   kdprov 75  strata_kabkot 1 
#>   kdprov 75  strata_kabkot 2 
#>   kdprov 76  strata_kabkot 1 
#>   kdprov 81  strata_kabkot 1 
#>   kdprov 81  strata_kabkot 2 
#>   kdprov 82  strata_kabkot 1 
#>   kdprov 82  strata_kabkot 2 
#>   kdprov 91  strata_kabkot 1 
#>   kdprov 91  strata_kabkot 2 
#>   kdprov 94  strata_kabkot 1 
#>   kdprov 94  strata_kabkot 2
#> Joining with `by = join_by(kdprov, strata_kabkot)`
#> All allocations have been selected. Selected 100 out of 100 (100%)

Displaying the sampling result with implicit stratification

Population Sampled

head(dtSampling_implicit$pop)
#>   idkab kdprov kdkab nmprov           nmkab Laki-laki Perempuan  Total
#> 1  1101     11    01   ACEH        SIMEULUE     47630     45235  92865
#> 2  1116     11    16   ACEH       ACEH JAYA     47264     45895  93159
#> 3  1113     11    13   ACEH       GAYO LUES     50026     49506  99532
#> 4  1102     11    02   ACEH    ACEH SINGKIL     63978     62536 126514
#> 5  1112     11    12   ACEH ACEH BARAT DAYA     76254     74521 150775
#> 6  1118     11    18   ACEH      PIDIE JAYA     78742     79655 158397
#>   strata_kabkot flags  tanggal
#> 1             1  <NA>     <NA>
#> 2             1  <NA>     <NA>
#> 3             1     U 28/09/24
#> 4             1  <NA>     <NA>
#> 5             1  <NA>     <NA>
#> 6             1  <NA>     <NA>

Units Sampled

head(dtSampling_implicit$sampledf)
#>   idkab kdprov kdkab         nmprov          nmkab Laki-laki Perempuan  Total
#> 1  1113     11    13           ACEH      GAYO LUES     50026     49506  99532
#> 2  1107     11    07           ACEH     ACEH BARAT    100492     98244 198736
#> 3  1105     11    05           ACEH     ACEH TIMUR    212286    210115 422401
#> 4  1172     11    72           ACEH         SABANG     20838     20359  41197
#> 5  1201     12    01 SUMATERA UTARA           NIAS     71686     74986 146672
#> 6  1205     12    05 SUMATERA UTARA TAPANULI UTARA    156176    156582 312758
#>   strata_kabkot flags  tanggal
#> 1             1     U 28/09/24
#> 2             1     U 28/09/24
#> 3             1     U 28/09/24
#> 4             2     U 28/09/24
#> 5             1     U 28/09/24
#> 6             1     U 28/09/24

dtSampling_implicit$sampledf %>% nrow
#> [1] 100

dtSampling_implicit$sampledf %>% count(strata_kabkot)
#>   strata_kabkot  n
#> 1             1 63
#> 2             2 37

Sampling Details

head(dtSampling_implicit$details)
#>   kdprov strata_kabkot jml_kabkota n_alloc n_primary npop        ar        k
#> 1     11             1          18       4         3   18 0.4643545 6.000000
#> 2     11             2           5       4         1    5 0.2667689 5.000000
#> 3     12             1          25       5         3   25 0.4806872 8.333333
#> 4     12             2           8       5         2    8 0.4282740 4.000000
#> 5     13             1          12       3         2   12 0.7073768 6.000000
#> 6     13             2           7       3         1    7 0.1943529 7.000000
#>   n_deficit n_selected
#> 1         0          3
#> 2         0          1
#> 3         0          3
#> 4         0          2
#> 5         0          2
#> 6         0          1

Sampling with Predetermined Random Number

Sometimes the random numbers to be used for sampling have been determined beforehand. The samplingin package supports this through the parameter predetermined_rn, which takes the name of the variable that stores the predetermined random numbers. For example, if the random numbers are stored in the allocation data frame in a variable named arand, we add predetermined_rn = 'arand'.

set.seed(988)
alokasi_dt_arand = alokasi_dt_strata %>%
  mutate(arand = runif(n(),0,1))

alokasi_dt_arand %>% as.data.frame %>% head(10)
#>    kdprov strata_kabkot jml_kabkota n_alloc n_primary       arand
#> 1      11             1          18       4         3 0.769460780
#> 2      11             2           5       4         1 0.341945429
#> 3      12             1          25       5         3 0.424880783
#> 4      12             2           8       5         2 0.635811473
#> 5      13             1          12       3         2 0.552268938
#> 6      13             2           7       3         1 0.394085933
#> 7      14             1          10       3         2 0.875416729
#> 8      14             2           2       3         1 0.006720942
#> 9      15             1           9       3         2 0.402575672
#> 10     15             2           2       3         1 0.155631074

dtSampling_prn = doSampling(
  pop        = pop_dt_strata,
  alloc      = alokasi_dt_arand,
  nsample    = "n_primary",
  seed       = 974,
  method     = "systematic",
  strata     = "strata_kabkot",
  predetermined_rn = "arand",
  ident      = c("kdprov"),
  type       = "U"
)
#> no implicit stratification variable chosen, sort by: kdprov and strata_kabkot
#> Negative allocation: 0
#> Zero allocation: 0
#> Using Predetermined Random Number on Allocation (arand)
#>   kdprov 11  strata_kabkot 1 
#>   kdprov 11  strata_kabkot 2 
#>   kdprov 12  strata_kabkot 1 
#>   kdprov 12  strata_kabkot 2 
#>   kdprov 13  strata_kabkot 1 
#>   kdprov 13  strata_kabkot 2 
#>   kdprov 14  strata_kabkot 1 
#>   kdprov 14  strata_kabkot 2 
#>   kdprov 15  strata_kabkot 1 
#>   kdprov 15  strata_kabkot 2 
#>   kdprov 16  strata_kabkot 1 
#>   kdprov 16  strata_kabkot 2 
#>   kdprov 17  strata_kabkot 1 
#>   kdprov 17  strata_kabkot 2 
#>   kdprov 18  strata_kabkot 1 
#>   kdprov 18  strata_kabkot 2 
#>   kdprov 19  strata_kabkot 1 
#>   kdprov 19  strata_kabkot 2 
#>   kdprov 21  strata_kabkot 1 
#>   kdprov 21  strata_kabkot 2 
#>   kdprov 31  strata_kabkot 1 
#>   kdprov 31  strata_kabkot 2 
#>   kdprov 32  strata_kabkot 1 
#>   kdprov 32  strata_kabkot 2 
#>   kdprov 33  strata_kabkot 1 
#>   kdprov 33  strata_kabkot 2 
#>   kdprov 34  strata_kabkot 1 
#>   kdprov 34  strata_kabkot 2 
#>   kdprov 35  strata_kabkot 1 
#>   kdprov 35  strata_kabkot 2 
#>   kdprov 36  strata_kabkot 1 
#>   kdprov 36  strata_kabkot 2 
#>   kdprov 51  strata_kabkot 1 
#>   kdprov 51  strata_kabkot 2 
#>   kdprov 52  strata_kabkot 1 
#>   kdprov 52  strata_kabkot 2 
#>   kdprov 53  strata_kabkot 1 
#>   kdprov 53  strata_kabkot 2 
#>   kdprov 61  strata_kabkot 1 
#>   kdprov 61  strata_kabkot 2 
#>   kdprov 62  strata_kabkot 1 
#>   kdprov 62  strata_kabkot 2 
#>   kdprov 63  strata_kabkot 1 
#>   kdprov 63  strata_kabkot 2 
#>   kdprov 64  strata_kabkot 1 
#>   kdprov 64  strata_kabkot 2 
#>   kdprov 65  strata_kabkot 1 
#>   kdprov 65  strata_kabkot 2 
#>   kdprov 71  strata_kabkot 1 
#>   kdprov 71  strata_kabkot 2 
#>   kdprov 72  strata_kabkot 1 
#>   kdprov 72  strata_kabkot 2 
#>   kdprov 73  strata_kabkot 1 
#>   kdprov 73  strata_kabkot 2 
#>   kdprov 74  strata_kabkot 1 
#>   kdprov 74  strata_kabkot 2 
#>   kdprov 75  strata_kabkot 1 
#>   kdprov 75  strata_kabkot 2 
#>   kdprov 76  strata_kabkot 1 
#>   kdprov 81  strata_kabkot 1 
#>   kdprov 81  strata_kabkot 2 
#>   kdprov 82  strata_kabkot 1 
#>   kdprov 82  strata_kabkot 2 
#>   kdprov 91  strata_kabkot 1 
#>   kdprov 91  strata_kabkot 2 
#>   kdprov 94  strata_kabkot 1 
#>   kdprov 94  strata_kabkot 2
#> Joining with `by = join_by(kdprov, strata_kabkot)`
#> All allocations have been selected. Selected 100 out of 100 (100%)

Displaying the sampling result with predetermined random number

Population Sampled

# Preview the first rows of the `pop` element of the sampling result
dtSampling_prn$pop %>% head()
#>   idkab kdprov kdkab nmprov         nmkab Laki-laki Perempuan  Total
#> 1  1101     11    01   ACEH      SIMEULUE     47630     45235  92865
#> 2  1102     11    02   ACEH  ACEH SINGKIL     63978     62536 126514
#> 3  1103     11    03   ACEH  ACEH SELATAN    116542    115872 232414
#> 4  1104     11    04   ACEH ACEH TENGGARA    110799    110061 220860
#> 5  1105     11    05   ACEH    ACEH TIMUR    212286    210115 422401
#> 6  1106     11    06   ACEH   ACEH TENGAH    109262    106314 215576
#>   strata_kabkot flags  tanggal
#> 1             1  <NA>     <NA>
#> 2             1  <NA>     <NA>
#> 3             1  <NA>     <NA>
#> 4             1  <NA>     <NA>
#> 5             1     U 28/09/24
#> 6             1  <NA>     <NA>

Units Sampled

# Preview the first rows of the `sampledf` element (the selected units)
dtSampling_prn$sampledf %>% head()
#>   idkab kdprov kdkab         nmprov           nmkab Laki-laki Perempuan   Total
#> 1  1105     11    05           ACEH      ACEH TIMUR    212286    210115  422401
#> 2  1111     11    11           ACEH      ACEH UTARA    301211    301582  602793
#> 3  1117     11    17           ACEH    BENER MERIAH     81765     79577  161342
#> 4  1172     11    72           ACEH          SABANG     20838     20359   41197
#> 5  1204     12    04 SUMATERA UTARA TAPANULI TENGAH    183814    181363  365177
#> 6  1212     12    12 SUMATERA UTARA    DELI SERDANG    971735    959706 1931441
#>   strata_kabkot flags  tanggal
#> 1             1     U 28/09/24
#> 2             1     U 28/09/24
#> 3             1     U 28/09/24
#> 4             2     U 28/09/24
#> 5             1     U 28/09/24
#> 6             1     U 28/09/24

# Number of selected units
nrow(dtSampling_prn$sampledf)
#> [1] 100

Sampling Details

# Preview the first rows of the `details` element of the sampling result
dtSampling_prn$details %>% head()
#>   kdprov strata_kabkot jml_kabkota n_alloc n_primary     arand npop        ar
#> 1     11             1          18       4         3 0.7694608   18 0.7694608
#> 2     11             2           5       4         1 0.3419454    5 0.3419454
#> 3     12             1          25       5         3 0.4248808   25 0.4248808
#> 4     12             2           8       5         2 0.6358115    8 0.6358115
#> 5     13             1          12       3         2 0.5522689   12 0.5522689
#> 6     13             2           7       3         1 0.3940859    7 0.3940859
#>          k n_deficit n_selected
#> 1 6.000000         0          3
#> 2 5.000000         0          1
#> 3 8.333333         0          3
#> 4 4.000000         0          2
#> 5 6.000000         0          2
#> 6 7.000000         0          1

Allocate predetermined allocations to lower levels

One of the supporting functions in the samplingin package is get_allocation. This function distributes a predetermined sample allocation to lower levels using proportional allocation based on the square root of a specified variable.

For example, suppose sample allocations are available at the province level and need to be distributed to a lower level, such as regencies/cities, using proportional allocation based on the square root of the total population (Total).

# Build an illustrative province-level allocation: drop the two allocation
# columns of alokasi_dt and draw one integer `init_alloc` per province.
# The seed is fixed so the drawn numbers are reproducible.
set.seed(242)
alokasi_prov <- alokasi_dt %>%
  select(-c(jml_kabkota, n_primary)) %>%
  mutate(init_alloc = as.integer(runif(n(), min = 100, max = 200))) %>%
  as.data.frame()

# First ten province-level allocations
head(alokasi_prov, 10)
#>    kdprov init_alloc
#> 1      11        178
#> 2      12        100
#> 3      13        133
#> 4      14        168
#> 5      15        165
#> 6      16        176
#> 7      17        175
#> 8      18        192
#> 9      19        102
#> 10     21        164

# Grand total of the province-level allocations
summarise(alokasi_prov, sum(init_alloc))
#>   sum(init_alloc)
#> 1            5168

# Attach each province's initial allocation to its rows in pop_dt, then let
# get_allocation() distribute it within each province (grouped by kdprov)
# using Total as the population variable.
# The join key is stated explicitly so the join cannot silently pick up any
# other column the two tables might share; as a consequence dplyr no longer
# prints its "Joining with ..." message.
alokasi_kab <- pop_dt %>%
  left_join(alokasi_prov, by = "kdprov") %>%
  get_allocation(
    n_alloc = "init_alloc",
    group   = c("kdprov"),
    pop_var = "Total"
  ) %>%
  as.data.frame()

# First ten rows of the allocation distributed to the lower level
head(alokasi_kab, 10)
#>    idkab kdprov kdkab nmprov         nmkab Laki-laki Perempuan  Total
#> 1   1101     11    01   ACEH      SIMEULUE     47630     45235  92865
#> 2   1102     11    02   ACEH  ACEH SINGKIL     63978     62536 126514
#> 3   1103     11    03   ACEH  ACEH SELATAN    116542    115872 232414
#> 4   1104     11    04   ACEH ACEH TENGGARA    110799    110061 220860
#> 5   1105     11    05   ACEH    ACEH TIMUR    212286    210115 422401
#> 6   1106     11    06   ACEH   ACEH TENGAH    109262    106314 215576
#> 7   1107     11    07   ACEH    ACEH BARAT    100492     98244 198736
#> 8   1108     11    08   ACEH    ACEH BESAR    204428    201107 405535
#> 9   1109     11    09   ACEH         PIDIE    215878    219397 435275
#> 10  1110     11    10   ACEH       BIREUEN    215282    221136 436418
#>    init_alloc n_primary
#> 1         178         5
#> 2         178         6
#> 3         178         8
#> 4         178         8
#> 5         178        11
#> 6         178         8
#> 7         178         8
#> 8         178        11
#> 9         178        11
#> 10        178        11

# Grand total of the distributed allocations
summarise(alokasi_kab, sum(n_primary))
#>   sum(n_primary)
#> 1           5168

# Distributed allocations summed back up per province
alokasi_kab %>%
  group_by(kdprov) %>%
  summarise(sum(n_primary))
#> # A tibble: 34 x 2
#>   kdprov `sum(n_primary)`
#>   <chr>             <dbl>
#> 1 11                  178
#> 2 12                  100
#> 3 13                  133
#> 4 14                  168
#> # ... with 30 more rows

# Check: summing the distributed allocation per province must reproduce the
# initial province-level allocation.
all.equal(
  alokasi_prov,
  alokasi_kab %>%
    group_by(kdprov) %>%
    summarise(init_alloc = sum(n_primary)) %>%
    ungroup() %>%
    as.data.frame()
)
#> [1] TRUE