stRoke/R/cpr_tools.R

178 lines
5.0 KiB
R
Raw Permalink Normal View History

2022-09-22 14:20:46 +02:00
#' CPR check
#'
#' Checking validity of cpr number. Vectorised.
2023-01-11 12:54:08 +01:00
#' @param cpr cpr-numbers as ddmmyy"-."xxxx or ddmmyyxxxx.
#' Also mixed formatting. Vector or data frame column.
2022-09-22 14:20:46 +02:00
#' @keywords cpr
#'
2022-09-22 19:53:15 +02:00
#' @return Logical vector of cpr validity
2022-09-22 14:20:46 +02:00
#' @export
#'
2022-09-26 13:59:05 +02:00
#' @examples
2023-01-11 12:54:08 +01:00
#' fsd<-c("2310450637", "010190-2000", "010115-4000",
#' "300450-1030","010150-4021")
2022-09-26 13:59:05 +02:00
#' cpr_check("2310450637")
2022-09-22 14:20:46 +02:00
#' cpr_check(fsd)
#' all(cpr_check(fsd))
2023-01-12 13:44:29 +01:00
cpr_check <- function(cpr) {
2022-09-22 14:20:46 +02:00
# Check validity of CPR number, format ddmmyy-xxxx
2023-01-12 13:44:29 +01:00
# Build upon data from this document:
2023-01-11 12:54:08 +01:00
# https://cpr.dk/media/12066/personnummeret-i-cpr.pdf
2023-01-12 13:44:29 +01:00
## OBS according to new description, not all valid CPR numbers
2023-01-11 12:54:08 +01:00
## apply to this modulus 11 rule.
2022-09-22 19:53:15 +02:00
message(
2023-01-12 13:44:29 +01:00
"OBS: as per 2007 not all valid CPR numbers apply to modulus 11 rule.
\nSee the vignette 'Toolbox'"
)
2022-09-22 14:20:46 +02:00
2023-01-12 13:44:29 +01:00
str_length <- nchar(cpr)
2022-09-22 19:53:15 +02:00
# Calculating length of each element in vector
2022-09-22 14:20:46 +02:00
2023-01-12 13:44:29 +01:00
cpr_short <-
paste0(substr(cpr, 1, 6), substr(cpr, str_length - 3, str_length))
2022-09-22 19:53:15 +02:00
# Subsetting strings to first 6 and last 4 characters to short format cpr.
2022-09-22 14:20:46 +02:00
2023-01-12 13:44:29 +01:00
cpr_matrix <-
matrix(as.numeric(unlist(strsplit(cpr_short, ""))), nrow = 10)
# Splitting all strings by each character to list,
2023-01-11 12:54:08 +01:00
# unlisting and creating matrix. Default is by column.
2022-09-22 19:53:15 +02:00
2023-01-12 13:44:29 +01:00
test_vector <- c(4, 3, 2, 7, 6, 5, 4, 3, 2, 1)
# Multiplication vector from
2023-01-11 12:54:08 +01:00
# https://cpr.dk/media/12066/personnummeret-i-cpr.pdf
2022-09-22 19:53:15 +02:00
2023-01-12 13:44:29 +01:00
colSums(cpr_matrix * test_vector) %% 11 == 0
2022-09-22 19:53:15 +02:00
# Testing if modulus 11 == 0 of sums of matrix * multiplication vector.
2022-09-22 14:20:46 +02:00
}
#' Extracting date of birth from CPR
#'
2022-09-26 13:59:05 +02:00
#' For easy calculation. Does not handle cprs with letters (interim cpr)
2023-01-11 12:54:08 +01:00
#' @param cpr cpr-numbers as ddmmyy"-."xxxx or ddmmyyxxxx.
#' Also mixed formatting. Vector or data frame column.
2023-01-05 10:21:46 +01:00
#' @param format character string of dob date format. Default is "%d-%m-%Y".
2022-09-22 14:20:46 +02:00
#' @keywords cpr
2023-01-05 10:21:46 +01:00
#'
#' @return character vector
2022-09-22 14:20:46 +02:00
#' @export
#'
#' @examples
#' cpr_dob("231045-0637")
2023-01-11 12:54:08 +01:00
#' fsd<-c("2310450637", "010190-2000", "010115-4000",
#' "300450-1030","010150-4021")
2022-09-22 14:20:46 +02:00
#' cpr_dob(fsd)
2023-01-12 13:44:29 +01:00
cpr_dob <- function(cpr, format = "%d-%m-%Y") {
2022-09-22 14:20:46 +02:00
## Input as cpr-numbers in format ddmmyy-xxxx
2023-01-12 13:44:29 +01:00
## Build upon data from this document:
2023-01-11 12:54:08 +01:00
## https://cpr.dk/media/12066/personnummeret-i-cpr.pdf
# Checks format and length
2023-01-12 13:44:29 +01:00
check_form <- (nchar(cpr) == 10 & grepl("-", cpr)) |
(nchar(cpr) == 11 & !grepl("[^A-Za-z0-9]", substr(cpr, 7, 7))) |
2023-01-11 12:54:08 +01:00
!nchar(cpr) %in% 10:11
2023-01-12 13:44:29 +01:00
if (any(check_form)) {
warning("CPR length should be in format ddmmyy-xxxx or ddmmyyxxxx.
Output will contain NAs")
2023-01-11 12:54:08 +01:00
}
2023-01-12 13:44:29 +01:00
str_length <- nchar(cpr)
2023-01-11 12:54:08 +01:00
# Calculating length of each element in vector
2023-01-12 13:44:29 +01:00
cpr_short_all <-
paste0(substr(cpr, 1, 6), substr(cpr, str_length - 3, str_length))
2023-01-11 12:54:08 +01:00
# Subsetting strings to first 6 and last 4 characters to short format cpr.
# Checks if letters in other positions than 1:7 or 10
2023-01-12 13:44:29 +01:00
check_lets <- grepl("\\D", paste0(substr(cpr_short_all, 1, 7),
substr(cpr_short_all, 10, 10)))
2023-01-11 12:54:08 +01:00
if (any(check_lets)) {
2023-01-12 13:44:29 +01:00
warning(
"Does only handle CPRs with letters in position 2 and 3 of
the last 4 positions. Output will contain NAs"
)
}
2023-01-11 12:54:08 +01:00
checks_any <- check_form | check_lets
non_na <- seq_along(cpr)[!checks_any]
cpr_short <- cpr_short_all[!checks_any]
2022-09-22 14:20:46 +02:00
2023-01-12 13:44:29 +01:00
dobs <- c()
2022-09-22 14:20:46 +02:00
2023-01-12 13:44:29 +01:00
a00 <- c(0:99)
a36 <- c(0:36)
a57 <- c(0:57)
2022-09-22 19:53:15 +02:00
2023-01-12 13:44:29 +01:00
b00 <- c(0:3)
b36 <- c(4, 9)
b57 <- c(5:8)
2022-09-22 19:53:15 +02:00
2023-01-12 13:44:29 +01:00
year <- as.numeric(substr(cpr_short, 5, 6))
2022-09-22 19:53:15 +02:00
2023-01-12 13:44:29 +01:00
ddmmyy <- as.Date(substr(cpr_short, 1, 6), format = "%d%m%y")
2022-09-22 14:20:46 +02:00
2023-01-12 13:44:29 +01:00
for (i in seq_along(cpr_short)) {
2022-09-22 19:53:15 +02:00
p56 <- year[i]
2022-09-22 14:20:46 +02:00
2023-01-12 13:44:29 +01:00
p7 <- substr(cpr_short[i], 7, 7)
2022-09-22 14:20:46 +02:00
2022-09-22 19:53:15 +02:00
birth <- ddmmyy[i]
2022-09-22 14:20:46 +02:00
2023-01-12 13:44:29 +01:00
if (((p56 %in% a00) && (p7 %in% b00)))
2022-09-22 14:20:46 +02:00
{
2023-01-12 13:44:29 +01:00
dob <- as.Date(format(birth, format = "19%y%m%d"), format = "%Y%m%d")
2022-09-22 14:20:46 +02:00
}
2023-01-12 13:44:29 +01:00
else if (((p56 %in% a36) && (p7 %in% b36)))
2022-09-22 14:20:46 +02:00
{
2023-01-12 13:44:29 +01:00
dob <- as.Date(format(birth, format = "20%y%m%d"), format = "%Y%m%d")
2022-09-22 14:20:46 +02:00
}
2023-01-12 13:44:29 +01:00
else if ((!(p56 %in% a36) && (p7 %in% b36)))
2022-09-22 14:20:46 +02:00
{
2023-01-12 13:44:29 +01:00
dob <- as.Date(format(birth, format = "19%y%m%d"), format = "%Y%m%d")
2022-09-22 14:20:46 +02:00
}
2023-01-12 13:44:29 +01:00
else if (((p56 %in% a57) && (p7 %in% b57)))
2022-09-22 14:20:46 +02:00
{
2023-01-12 13:44:29 +01:00
dob <- as.Date(format(birth, format = "20%y%m%d"), format = "%Y%m%d")
2022-09-22 14:20:46 +02:00
}
2023-01-12 13:44:29 +01:00
else if ((!(p56 %in% a57) && (p7 %in% b57)))
2022-09-22 14:20:46 +02:00
{
2023-01-12 13:44:29 +01:00
dob <- as.Date(format(birth, format = "18%y%m%d"), format = "%Y%m%d")
2022-09-22 14:20:46 +02:00
}
2023-01-12 13:44:29 +01:00
dobs[i] <- dob
2022-09-22 14:20:46 +02:00
}
2023-01-12 13:44:29 +01:00
dobs <-
format(as.Date(dobs, origin = "1970-01-01"), format = format)
merge(
data.frame(index = seq_along(cpr), dobs = NA),
data.frame(index = non_na, dobs),
by = "index",
all = TRUE
)[, 3]
2022-09-22 14:20:46 +02:00
}
2022-09-22 15:45:40 +02:00
#' Determine female sex from CPR
#'
#' Just checking if last number of a string is equal or not.
2023-01-11 12:54:08 +01:00
#' @param cpr Vector. cpr-numbers as ddmmyy"-."xxxx or ddmmyyxxxx.
#' Also mixed formatting. Vector or data frame column.
2022-09-22 15:45:40 +02:00
#' @keywords cpr
#'
#' @return Logical vector
#' @export
#' @examples
#' cpr_female(stRoke::cprs[,1])
2023-01-12 13:44:29 +01:00
cpr_female <- function(cpr) {
if (!is.vector(cpr))
stop("Input has to be vector")
x <-
nchar(as.character(cpr)) # Formats as character to avoid confusions
2022-09-22 15:45:40 +02:00
2022-09-26 13:59:05 +02:00
as.integer(substr(cpr, start = x, stop = x)) %% 2 == 0
2022-09-22 15:45:40 +02:00
}