library(SCDB)
#> 
#> Attaching package: 'SCDB'
#> The following object is masked from 'package:base':
#> 
#>     nrowThe basic principle of the SCDB package is to enable the user to easily implement and maintain a database of time-versioned data.
In practice, this is done by labeling each record in the data with three additional fields:
This strategy of time versioning is often called “type 2” history (Kimball and Ross 2013).
The SCDB package provides the function update_snapshot
to handle the insertion and deactivation of records using this strategy.
It further includes several functions to improve the Quality of life for
working with database data.
A simple example of usage is shown below.
 For this example, we
use a temporary, on-disk SQLite database. Note that
get_connection() tries to establish connection using
DBI::dbConnect() with as few additional arguments as
possible. Different drivers may require authentication which can be read
from a configuration file.1
Our example data is datasets::mtcars reduced to only two
columns: row names converted to a column car, and
hp
example_data <- dplyr::tbl(conn, DBI::Id(table = "example_data"))
example_data
#> # Source:   table<`example_data`> [?? x 2]
#> # Database: sqlite 3.47.1 [:memory:]
#>    car                  hp
#>    <chr>             <dbl>
#>  1 Mazda RX4           110
#>  2 Mazda RX4 Wag       110
#>  3 Datsun 710           93
#>  4 Hornet 4 Drive      110
#>  5 Hornet Sportabout   175
#>  6 Valiant             105
#>  7 Duster 360          245
#>  8 Merc 240D            62
#>  9 Merc 230             95
#> 10 Merc 280            123
#> # ℹ more rowsImagine on Day 1, in this case January 1st, 2020, our currently
available data is the first three records of the
example_data. We then store this data in a table
mtcars:
data <- head(example_data, 3)
update_snapshot(
  .data = data,
  conn = conn,
  db_table = "mtcars", # the name of the DB table to store the data in
  timestamp = as.POSIXct("2020-01-01 11:00:00")
)
#> NULLWe can then access out data using the get_table()
function, and include information on data validity period using
include_slice_info = TRUE:
get_table(conn, "mtcars")
#> # Source:   SQL [?? x 2]
#> # Database: sqlite 3.47.1 [:memory:]
#>   car              hp
#>   <chr>         <dbl>
#> 1 Mazda RX4       110
#> 2 Mazda RX4 Wag   110
#> 3 Datsun 710       93
get_table(conn, "mtcars", include_slice_info = TRUE)
#> # Source:   SQL [?? x 5]
#> # Database: sqlite 3.47.1 [:memory:]
#>   car              hp checksum                         from_ts          until_ts
#>   <chr>         <dbl> <chr>                            <chr>               <dbl>
#> 1 Mazda RX4       110 7cbe488757cc85aab6583dbc4226bf68 2020-01-01 11:0…       NA
#> 2 Mazda RX4 Wag   110 b82618e7f5dd30d5df68540cecc696c8 2020-01-01 11:0…       NA
#> 3 Datsun 710       93 08c864e3854eb5a1460d87b3360d636f 2020-01-01 11:0…       NANote that where e.g. dplyr::tbl() requires a more exact
specification of the table identity
(tbl(conn, DBI::Id(table = "mtcars"))),
get_table() will parse any character to a
DBI::Id() object input using id().
The following day, the current data is now the first five rows of our
example data. We then store this data in the database using
update_snapshot():
# Let's say that the next day, our data set is now the first 5 of our example data
data <- head(example_data, 5)
update_snapshot(
  .data = data,
  conn = conn,
  db_table = "mtcars", # the name of the DB table to store the data in
  timestamp = as.POSIXct("2020-01-02 12:00:00")
)
#> NULLWe can again use the get_table() function to see the
latest available data, including time-keeping with
include_slice_info = TRUE:
get_table(conn, "mtcars")
#> # Source:   SQL [?? x 2]
#> # Database: sqlite 3.47.1 [:memory:]
#>   car                  hp
#>   <chr>             <dbl>
#> 1 Mazda RX4           110
#> 2 Mazda RX4 Wag       110
#> 3 Datsun 710           93
#> 4 Hornet 4 Drive      110
#> 5 Hornet Sportabout   175
get_table(conn, "mtcars", include_slice_info = TRUE)
#> # Source:   SQL [?? x 5]
#> # Database: sqlite 3.47.1 [:memory:]
#>   car                  hp checksum                         from_ts      until_ts
#>   <chr>             <dbl> <chr>                            <chr>           <dbl>
#> 1 Mazda RX4           110 7cbe488757cc85aab6583dbc4226bf68 2020-01-01 …       NA
#> 2 Mazda RX4 Wag       110 b82618e7f5dd30d5df68540cecc696c8 2020-01-01 …       NA
#> 3 Datsun 710           93 08c864e3854eb5a1460d87b3360d636f 2020-01-01 …       NA
#> 4 Hornet 4 Drive      110 3c1b6c43b206dd93ee4f6c3d06e1b416 2020-01-02 …       NA
#> 5 Hornet Sportabout   175 9355ed7a70e3ff73a4b6ee7f7129aa35 2020-01-02 …       NASince our data is time-versioned, we can recover the data from the day before
get_table(conn, "mtcars", slice_ts = "2020-01-01 11:00:00")
#> # Source:   SQL [?? x 2]
#> # Database: sqlite 3.47.1 [:memory:]
#>   car              hp
#>   <chr>         <dbl>
#> 1 Mazda RX4       110
#> 2 Mazda RX4 Wag   110
#> 3 Datsun 710       93On day 3, we imagine that we have the same 5 records, but one of them is altered
data <- head(example_data, 5) %>%
  dplyr::mutate(hp = ifelse(car == "Mazda RX4", hp / 2, hp))
update_snapshot(
  .data = data,
  conn = conn,
  db_table = "mtcars", # the name of the DB table to store the data in
  timestamp = as.POSIXct("2020-01-03 10:00:00")
)
#> NULLWe can again access our data using the get_table()
function and see that the currently available data (with the changed hp
value for Mazda RX4)
get_table(conn, "mtcars")
#> # Source:   SQL [?? x 2]
#> # Database: sqlite 3.47.1 [:memory:]
#>   car                  hp
#>   <chr>             <dbl>
#> 1 Mazda RX4 Wag       110
#> 2 Datsun 710           93
#> 3 Hornet 4 Drive      110
#> 4 Hornet Sportabout   175
#> 5 Mazda RX4            55Finally, using slice_ts = NULL, the full history (and
time-keeping information) is returned:
get_table(conn, "mtcars", slice_ts = NULL)
#> # Source:   table<`main`.`mtcars`> [?? x 5]
#> # Database: sqlite 3.47.1 [:memory:]
#>   car                  hp checksum                         from_ts      until_ts
#>   <chr>             <dbl> <chr>                            <chr>        <chr>   
#> 1 Mazda RX4           110 7cbe488757cc85aab6583dbc4226bf68 2020-01-01 … 2020-01…
#> 2 Mazda RX4 Wag       110 b82618e7f5dd30d5df68540cecc696c8 2020-01-01 … <NA>    
#> 3 Datsun 710           93 08c864e3854eb5a1460d87b3360d636f 2020-01-01 … <NA>    
#> 4 Hornet 4 Drive      110 3c1b6c43b206dd93ee4f6c3d06e1b416 2020-01-02 … <NA>    
#> 5 Hornet Sportabout   175 9355ed7a70e3ff73a4b6ee7f7129aa35 2020-01-02 … <NA>    
#> 6 Mazda RX4            55 1232f78f7befb3a765b91176eaacdbb0 2020-01-03 … <NA> In the context of the SCDB package, this is most
notably RPostgres::Postgres(), which may read from a
.pgpass file. See also the PostgreSQL
documentation.↩︎