The privacyR package helps you anonymize sensitive data
in healthcare and research datasets. It provides tools to protect
patient privacy while keeping your data useful for analysis.
# Install from CRAN
install.packages("privacyR")

Anonymize patient IDs while keeping referential integrity (same IDs get the same anonymized value):
library(privacyR)
# Original patient IDs
patient_ids <- c("P001", "P002", "P003", "P001", "P002")
print(patient_ids)
#> [1] "P001" "P002" "P003" "P001" "P002"
# Anonymize IDs
anonymized_ids <- anonymize_id(patient_ids, seed = 123)
print(anonymized_ids)
#> P001 P002 P003 P001 P002
#> "ID_uhjsB46B" "ID_4cQBb8Pk" "ID_meDL69Fd" "ID_uhjsB46B" "ID_4cQBb8Pk"
# Note: Same original IDs map to same anonymized IDs

# Original names
names <- c("John Doe", "Jane Smith", "Bob Johnson", "John Doe")
print(names)
#> [1] "John Doe" "Jane Smith" "Bob Johnson" "John Doe"
# Anonymize names
anonymized_names <- anonymize_names(names, seed = 123)
print(anonymized_names)
#> John Doe Jane Smith Bob Johnson John Doe
#> "Patient_NDg84Pt8" "Patient_ufNUJoZm" "Patient_ADANaKPi" "Patient_NDg84Pt8"

Two methods are available for anonymizing dates: shifting or rounding.
Shifting moves all dates by the same amount, preserving relative time differences:
# Original dates
dates <- as.Date(c("2020-01-15", "2020-03-20", "2020-06-10"))
print(dates)
#> [1] "2020-01-15" "2020-03-20" "2020-06-10"
# Shift dates
shifted_dates <- anonymize_dates(dates, method = "shift", seed = 123)
print(shifted_dates)
#> [1] "2020-03-04" "2020-05-08" "2020-07-29"
# Relative differences are preserved
diff_original <- as.numeric(dates[2] - dates[1])
diff_shifted <- as.numeric(shifted_dates[2] - shifted_dates[1])
cat("Original difference:", diff_original, "days\n")
#> Original difference: 65 days
cat("Shifted difference:", diff_shifted, "days\n")
#> Shifted difference: 65 days

Rounding reduces precision by grouping dates into buckets (day, week, month, year, etc.):
# Round to month
rounded_month <- anonymize_dates(dates, method = "round",
granularity = "month", seed = 123)
print(rounded_month)
#> [1] "2020-01-01" "2020-03-01" "2020-06-01"
# Round to year
rounded_year <- anonymize_dates(dates, method = "round",
granularity = "year", seed = 123)
print(rounded_year)
#> [1] "2020-01-01" "2020-01-01" "2020-01-01"

# Original locations
locations <- c("New York, NY", "Los Angeles, CA", "Chicago, IL",
"New York, NY")
print(locations)
#> [1] "New York, NY" "Los Angeles, CA" "Chicago, IL" "New York, NY"
# Generalize locations
generalized <- anonymize_locations(locations, method = "generalize", seed = 123)
print(generalized)
#> New York, NY Los Angeles, CA Chicago, IL New York, NY
#> "Location_ggyQKpzW" "Location_hybAy7Aq" "Location_DpYfTU6h" "Location_ggyQKpzW"
# Or remove locations entirely
removed <- anonymize_locations(locations, method = "remove", seed = 123)
print(removed)
#> [1] "[Location Removed]" "[Location Removed]" "[Location Removed]"
#> [4] "[Location Removed]"

The anonymize_dataframe() function provides a convenient
way to anonymize entire data frames:
# Create sample patient data
patient_data <- data.frame(
patient_id = c("P001", "P002", "P003", "P001"),
name = c("John Doe", "Jane Smith", "Bob Johnson", "John Doe"),
dob = as.Date(c("1980-01-15", "1975-03-20", "1990-06-10", "1980-01-15")),
admission_date = as.Date(c("2020-01-10", "2020-02-15", "2020-03-20", "2020-01-10")),
location = c("New York, NY", "Los Angeles, CA", "Chicago, IL", "New York, NY"),
diagnosis = c("Hypertension", "Diabetes", "Hypertension", "Hypertension"),
age = c(40, 45, 30, 40)
)
print("Original data:")
#> [1] "Original data:"
print(patient_data)
#> patient_id name dob admission_date location diagnosis
#> 1 P001 John Doe 1980-01-15 2020-01-10 New York, NY Hypertension
#> 2 P002 Jane Smith 1975-03-20 2020-02-15 Los Angeles, CA Diabetes
#> 3 P003 Bob Johnson 1990-06-10 2020-03-20 Chicago, IL Hypertension
#> 4 P001 John Doe 1980-01-15 2020-01-10 New York, NY Hypertension
#> age
#> 1 40
#> 2 45
#> 3 30
#> 4 40
# Anonymize the entire data frame
anonymized_data <- anonymize_dataframe(patient_data, seed = 123)
print("\nAnonymized data:")
#> [1] "\nAnonymized data:"
print(anonymized_data)
#> patient_id name dob admission_date location
#> 1 ID_yy4R3rcM Patient_az6MEMKn 1980-10-20 2020-10-15 Location_j5VxZULh
#> 2 ID_wegx46wW Patient_VrFXRGmZ 1975-12-24 2020-11-20 Location_DDYWM3yf
#> 3 ID_2wEutqA2 Patient_ipQ2Y8Di 1991-03-16 2020-12-24 Location_xW5ux6EG
#> 4 ID_yy4R3rcM Patient_az6MEMKn 1980-10-20 2020-10-15 Location_j5VxZULh
#> diagnosis age
#> 1 Hypertension 40-49
#> 2 Diabetes 40-49
#> 3 Hypertension 30-39
#> 4 Hypertension 40-49

By default, anonymize_dataframe() automatically detects
columns based on naming patterns and data types:
# The function automatically detects:
# - ID columns: patient_id, subject_id, etc.
# - Name columns: name, patient_name, etc.
# - Date columns: date, dob, admission_date, etc.
# - Location columns: location, address, city, etc.
# You can also manually specify columns
manual_anon <- anonymize_dataframe(
patient_data,
id_cols = "patient_id",
name_cols = "name",
date_cols = c("dob", "admission_date"),
location_cols = "location",
auto_detect = FALSE,
seed = 123
)

Seeds and reproducibility:
The seed parameter is optional (default:
NULL). When seed = NULL, the package still
maintains referential integrity using a deterministic hash-based
approach, so the same inputs always produce the same outputs.

anonymized <- anonymize_dataframe(data, seed = 12345)

Referential integrity is maintained
automatically - same original values get the same anonymized values,
which preserves relationships in your data. This works even when
seed = NULL.
Date anonymization:
Location anonymization:
Validate your results - make sure anonymized data still works for your analysis.
Keep in mind:
For more information, see the package documentation:
?anonymize_dataframe
help(package = "privacyR")