#'   WOE Transformation
#'
#' \code{woe_trans} is for transforming data to woe.
#' The \code{woe_trans_all} function is a simpler wrapper for \code{woe_trans}.
#' @param dat A data.frame with independent variables.
#' @param target The name of target variable. Default is NULL.
#' @param x_list A list of x variables.
#' @param x  The name of an independent variable.
#' @param ex_cols Names of excluded variables. Regular expressions can also be used to match variable names. Default is NULL.
#' @param breaks_list  A list contains breaks of variables. it is generated by code{\link{get_breaks_all}},code{\link{get_breaks}}
#' @param bins_table A table contians woe of each bin of variables, it is generated by code{\link{get_bins_table_all}},code{\link{get_bins_table}}
#' @param note   Logical, outputs info. Default is TRUE.
#' @param parallel Logical, parallel computing. Default is FALSE.
#' @param woe_name Logical. Add "_woe" at the end of the variable name.
#' @param save_data Logical, save results in locally specified folder. Default is TRUE
#' @param file_name The name for periodically saved woe file. Default is "dat_woe".
#' @param dir_path The path for periodically saved woe file Default is "./data"
#' @param ...  Additional parameters.
#' @return A list of breaks for each variables.
#' @seealso \code{\link{get_tree_breaks}}, \code{\link{cut_equal}}, \code{\link{select_best_class}}, \code{\link{select_best_breaks}}
#' @examples
#' sub = cv_split(UCICreditCard, k = 30)[[1]]
#' dat = UCICreditCard[sub,]
#' dat = re_name(dat, "default.payment.next.month", "target")
#' dat = data_cleansing(dat, target = "target", obs_id = "ID", occur_time = "apply_date",
#' miss_values =  list("", -1))
#' 
#' train_test <- train_test_split(dat, split_type = "OOT", prop = 0.7,
#'                                 occur_time = "apply_date")
#' dat_train = train_test$train
#' dat_test = train_test$test
#' #get breaks of all predictive variables
#' x_list = c("PAY_0", "LIMIT_BAL", "PAY_AMT5", "EDUCATION", "PAY_3", "PAY_2")
#' breaks_list <- get_breaks_all(dat = dat_train, target = "target",
#'                               x_list = x_list, occur_time = "apply_date", ex_cols = "ID", 
#' save_data = FALSE, note  = FALSE)
#' #woe transform
#' train_woe = woe_trans_all(dat = dat_train,
#'                           target = "target",
#'                           breaks_list = breaks_list,
#'                           woe_name = FALSE)
#' test_woe = woe_trans_all(dat = dat_test,
#'                        target = "target",
#'                          breaks_list = breaks_list,
#'                          note = FALSE)
#'
#' @export

woe_trans_all <- function(dat, x_list = NULL, ex_cols = NULL, bins_table = NULL,
                          target = NULL, breaks_list = NULL, note = FALSE,
                          save_data = FALSE, parallel = FALSE, woe_name = FALSE,
                          file_name = NULL, dir_path = tempdir(), ...) {

    if (note) {
        cat(paste("[NOTE]  converting all variables to woe...."), "\n")
    }

    opt = options(stringsAsFactors = FALSE) #
    if (is.null(x_list)) {
        if (!is.null(bins_table)) {
            x_list = unique(bins_table[which(as.character(bins_table[, "Feature"]) != "Total"), "Feature"])
        } else {
            x_list = get_names(dat = dat,
                               types = c('factor', 'character', 'numeric', 'integer', 'double'),
                               ex_cols = c(target, ex_cols), get_ex = FALSE)
        }
    }
    ex_vars = get_names(dat = dat, types = c('factor', 'character', 'numeric', 'integer', 'double'),
                        ex_cols = x_list, get_ex = FALSE)
    dat_woe = loop_function(func = woe_trans, x_list = x_list,
                            args = list(dat = dat, bins_table = bins_table,
                                        target = target, breaks_list = breaks_list,
                                        woe_name = woe_name),
                            bind = "cbind", parallel = parallel)
    dat = cbind(dat[ex_vars], dat_woe)
    if (save_data) {
        dir_path = ifelse(!is.character(dir_path),
                      tempdir(), dir_path)
        if (!dir.exists(dir_path)) dir.create(dir_path)
        if (!is.character(file_name)) file_name = NULL
        save_dt(dat, file_name = ifelse(is.null(file_name), "dat.woe", paste(file_name, "dat.woe", sep = ".")), dir_path = dir_path, note = note)
    }
    options(opt) # reset
    return(dat)
}

#' @rdname woe_trans_all
#' @export

woe_trans <- function(dat, x, bins_table = NULL, target = NULL, breaks_list = NULL, woe_name = FALSE) {
    # bins_table
    if (is.null(bins_table)) {
        if (!is.null(breaks_list)) {
            bins_table = get_bins_table(dat = dat, x = x,
                                         target = target, breaks_list = breaks_list,
                                         note = FALSE)
        } else {
            stop("bins_table & breaks_list are both missing.\n")
        }
    }
    bins_tbl = bins_table[which(as.character(bins_table[, "Feature"]) == names(dat[x])),
                          c("Feature", "cuts", "bins", "woe")]
    if (woe_name) {
        woe_names = paste(names(dat[x]), "woe", sep = "_")
    } else {
        woe_names = names(dat[x])
    }
    if (length(bins_tbl) > 0 && all(as.character(bins_tbl[, "Feature"]) != "Total")) {
        bins = split_bins(dat = dat, x = x, breaks = bins_tbl[, c("cuts")], bins_no = TRUE)
        for (i in 1:length(unique(bins))) {
            dat[as.character(bins) == as.character(bins_tbl[i, "bins"]), woe_names] = bins_tbl[i, "woe"]
        }
        dat[, woe_names] = as.numeric(dat[, woe_names])
    }
    return(dat[woe_names])
}

#' One-Hot Encoding
#'
#' \code{one_hot_encoding} is for converting the factor or character variables into multiple columns
#' @param dat A dat frame.
#' @param cat_vars The name or Column index list to be one_hot encoded.
#' @param merge_cat Logical. If TRUE, to merge categories greater than 8, default is TRUE.
#' @param ex_cols  Variables to be  excluded, use regular expression matching
#' @param na_act Logical,If true, the missing value is processed, if FALSE missing value is omitted .
#' @param note Logical.Outputs info.Default is TRUE.
#' @return A dat frame with the one hot encoding applied to all the variables with type as factor or character.
#' @seealso \code{\link{de_one_hot_encoding}}
#' @examples
#' dat1 = one_hot_encoding(dat = UCICreditCard,
#' cat_vars = c("SEX", "MARRIAGE"),
#' merge_cat = TRUE, na_act = TRUE)
#' dat2 = de_one_hot_encoding(dat_one_hot = dat1,
#' cat_vars = c("SEX","MARRIAGE"), na_act = FALSE)
#'
#' @export
one_hot_encoding = function(dat, cat_vars = NULL, ex_cols = NULL,
                            merge_cat = TRUE, na_act = TRUE, note = FALSE) {
    if (note) cat("[NOTE]  one-hot encoding for charactor or factor.\n")
    if (class(dat)[1] != "data.frame") {
        dat <- as.data.frame(dat)
    }
    if (is.null(cat_vars)) {
        cat_vars <- get_names(dat = dat, types = c("character", "factor"), ex_cols = ex_cols)
    }
    if (length(cat_vars) > 0) {
        if (na_act) {
            dat[, cat_vars] = process_nas(dat[cat_vars], note = FALSE)
        }
        if (merge_cat) {
            dat[, cat_vars] = merge_category(dat[cat_vars], note = FALSE)
        }
        for (i in cat_vars) {
            if (is.factor(dat[, i]) || is.character(dat[, i])) {
                col_name = i
                dat[, i] = sapply(dat[, i], function(x) gsub(" |\"|\\$|\\*|\\?|\\[|\\^|\\{|\\}|\\\\|\\(|\\)|\\|\\)|\\]|\\.|\\-", "_", x))
                cat_list <- unique(dat[, i])
                encode_cols <- length(cat_list)
                #Create individual column for every unique value in the variable
                for (j in 1:encode_cols) {
                    one_hot_name <- (paste(col_name, ".", cat_list[j], ".", sep = ""))
                    dat[, one_hot_name] <- ifelse(dat[, i] == cat_list[j] & !is.na(dat[, i]), 1, 0)
                }
            }
        }
        dat = dat[, - which(names(dat) %in% cat_vars)]
    }
    return(dat)
}


#' Recovery One-Hot Encoding
#'
#' \code{de_one_hot_encoding} is for one-hot encoding recovery processing
#' @param dat_one_hot A dat frame with the one hot encoding variables
#' @param cat_vars  variables to be recovery processed, default is null, if null, find these variables through regular expressions .
#' @param na_act Logical,If true, the missing value is  assigned as "Missing", if FALSE missing value is omitted, the default is TRUE.
#' @param note Logical.Outputs info.Default is TRUE.
#' @return A dat frame with the one hot encoding recorery character variables
#' @seealso \code{\link{one_hot_encoding}}
#' @examples
#' #one hot encoding
#' dat1 = one_hot_encoding(dat = UCICreditCard,
#' cat_vars = c("SEX", "MARRIAGE"),
#' merge_cat = TRUE, na_act = TRUE)
#' #de one hot encoding
#' dat2 = de_one_hot_encoding(dat_one_hot = dat1,
#' cat_vars = c("SEX","MARRIAGE"),
#' na_act = FALSE)
#' @export

de_one_hot_encoding = function(dat_one_hot, cat_vars = NULL, na_act = TRUE, note = FALSE) {
    if (note) cat("[NOTE] recovery one-hot encoding for charactor or factor.\n")
    if (class(dat_one_hot)[1] != "data.frame") {
        dat_one_hot <- as.data.frame(dat_one_hot)
    }

    if (is.null(cat_vars)) {
        char_names = one_hot_names = one_hot_names = c()
        for (i in 1:length(dat_one_hot)) {
            char_names[i] <- sub(paste0("\\.$"), "", colnames(dat_one_hot)[i])
            if (!is.null(char_names[i]) && !is.na(char_names[i]) &&
                char_names[i] == colnames(dat_one_hot)[i]) {
                char_names[i] <- NA
            }
            one_hot_names[i] <- try(strsplit(char_names[i], "[.]")[[1]][1], silent = TRUE)
        }
        cat_vars <- unique(one_hot_names[!is.na(one_hot_names)])
    }

    one_hot_vars <- unlist(sapply(cat_vars, function(x) grep(paste0(x, "\\.", "\\S{1,100}", "\\."),
                                                             paste(colnames(dat_one_hot)))))

    de_cat_vars = intersect(cat_vars, unique(gsub("\\d{1}$", "", names(one_hot_vars))))

    if (length(de_cat_vars) > 0) {
        dat_one_hot[, de_cat_vars] <- lapply(de_cat_vars, function(x) {
            grx = cv_cols = names_1 = re_code = NULL
            grx = paste0(x, "\\.", "\\S{1,100}", "\\.$")
            cv_cols = grep(grx, paste(colnames(dat_one_hot)))
            names_1 = colnames(dat_one_hot)[cv_cols]
            if (na_act) {
                re_code = rep("other", nrow(dat_one_hot))
            } else {
                re_code = rep(NA, nrow(dat_one_hot))
            }
            for (i in 1:(length(names_1))) {
                re_code[which(dat_one_hot[cv_cols][i] == 1)] = strsplit(names_1[i], "[.]")[[1]][2]
            }
            return(re_code)
        }
    )
        names(dat_one_hot[, de_cat_vars]) = de_cat_vars
        dat_one_hot = data.frame(dat_one_hot, stringsAsFactors = FALSE)[, - one_hot_vars]
    }
    return(dat_one_hot)
}


#' Time Format Transfering
#'
#' \code{time_transfer} is for transfering time variables to time format.
#' @param dat A data frame
#' @param date_cols  Names of time variable or regular expressions for finding time variables. Default is  "DATE$|time$|date$|timestamp$|stamp$".
#' @param ex_cols Names of excluded variables. Regular expressions can also be used to match variable names. Default is NULL.
#' @param note   Logical, outputs info. Default is TRUE.
#' @return  A data.frame with transfermed time variables.
#' @examples
#' #transfer a variable.
#' dat = time_transfer(dat = lendingclub,date_cols = "issue_d")
#' class(dat[,"issue_d"])
#' #transfer a group of variables with similar name.
#' dat = time_transfer(dat = lendingclub,date_cols = "_d$")
#' class(dat[,"issue_d"])
#' #transfer all time variables.
#' dat = time_transfer(dat = lendingclub,date_cols = NULL)
#' class(dat[,"issue_d"])
#' @export

time_transfer <- function(dat, date_cols = "DATE$|time$|date$|timestamp$|stamp$",
                          ex_cols = NULL, note = FALSE) {
    dat <- checking_data(dat)
    if (note) {
        cat("[NOTE] format time variables.\n")
    }
    x_list = get_x_list(x_list = NULL, dat_train = dat, dat_test = NULL, ex_cols = ex_cols)
    date_cols1 = NULL
    if (!is.null(date_cols)) {
        date_cols1 <- names(dat[x_list])[colnames(dat[x_list]) %islike% date_cols]
    } else {
        date_cols1 = names(dat[x_list])
    }
    df_date = dat[date_cols1]
    df_date <- df_date[!colAllnas(df_date)]
    df_date = df_date[!sapply(df_date, is_date)]
    if (dim(df_date)[2] != 0) {
        df_date_cols <- names(df_date)
        t_sample <- list()
        t_len <- list()
        tryCatch({
            for (x in 1:ncol(df_date)) {
                t_sample[[x]] = min(unlist(lapply(as.character(sample(na.omit(df_date[[x]]), 1)),
                                              function(i) {
                                                  if (nchar(i) >= 8) { nchar(i) } else { 0 }
                                                  })), na.rm = T)
                t_len[[x]] = unlist(as.character(sample(na.omit(df_date[[x]]), 1)))
            }
        }, error = function(e) { cat("ERROR :", conditionMessage(e), "\n") },
    warning = function(w) { "" })
        date_cols2 = which(t_sample != 0)
        for (x in date_cols2) {
            if (t_sample[[x]] >= 8 & t_sample[[x]] <= 10 &
                grepl(pattern = "^[2]{1}[0]{1}[0-3]{1}[0-9]{1}-[0-9]{1}-[0-9]{1,2}$|^[2]{1}[0]{1}[0-3]{1}[0-9]{1}-[0-9]{1,2}-[0-3]{1}[0-9]{1}$|^[1]{1}[9]{1}[0-9]{2}-[0-9]{1,2}-[0-3]{1}[0-9]{1}$", x = gsub(" ", "", substr(t_len[[x]], 1, 10)))) {
                df_date[[x]] = as.Date(as.character(df_date[[x]]), "%Y-%m-%d")
            }
            if (t_sample[[x]] > 10 & grepl(pattern = "^[2]{1}[0]{1}[0-3]{1}[0-9]{1}-[0-9]{1}-[0-3]{1}[0-9]{1,2}$|^[2]{1}[0]{1}[0-3]{1}[0-9]{1}-[0-9]{1,2}-[0-3]{1}[0-9]{1}$|^[1]{1}[9]{1}[0-9]{2}-[0-9]{1,2}-[0-3]{1}[0-9]{1}$", x = gsub(" ", "", substr(t_len[[x]], 1, 10))) & grepl(pattern = "^[0-1]{1}[0-9]{1}:[0-9]{2}", gsub(" ", "", substr(t_len[[x]], 12, nchar(t_len[[x]]))))) {
                df_date[[x]] = as.Date(as.character(df_date[[x]]))
            }
            if (t_sample[[x]] >= 7 & t_sample[[x]] <= 9 &
                grepl(pattern = "^[2]{1}[0]{1}[0-5]{1}[0-9]{1}[0-9]{1}[0-3]{1}[0-9]{1,2}$|^[2]{1}[0]{1}[0-5]{1}[0-9]{1}[0-9]{1,2}[0-3]{1}[0-9]{1}$|^[1]{1}[9]{1}[0-9]{2}[0-9]{1,2}[0-3]{1}[0-9]{1}$", x = t_len[[x]])) {
                df_date[[x]] = as.Date(as.character(df_date[[x]]), "%Y%m%d")
            }
            if (t_sample[[x]] > 10 & grepl(pattern = "^[2]{1}[0]{1}[0-5]{1}[0-9]{1}[0-9]{1}[0-3]{1}[0-9]{1,2}$|^[2]{1}[0]{1}[0-5]{1}[0-9]{1}[0-9]{1,2}[0-3]{1}[0-9]{1}$|^[1]{1}[9]{1}[0-9]{2}[0-9]{1,2}[0-3]{1}[0-9]{1}$", x = gsub(" ", "", substr(t_len[[x]], 1, 10))) & grepl(pattern = "^[0-9]{1,2}:[0-9]{2}", gsub(" ", "", substr(t_len[[x]], 10, nchar(t_len[[x]]))))) {
                df_date[[x]] = as.Date(as.character(df_date[[x]]))
            }
            if (t_sample[[x]] >= 8 & t_sample[[x]] <= 10 &
                grepl(pattern = "^[2]{1}[0]{1}[0-3]{1}[0-9]{1}/[0-9]{1,2}/[0-9]{1,2}$|^[1]{1}[9]{1}[0-9]{2}/[0-9]{1,2}/[0-9]{1,2}$", x = gsub(" ", "", substr(t_len[[x]], 1, 10)))) {
                df_date[[x]] = as.Date(as.character(df_date[[x]]), "%Y/%m/%d")
            }
            if (t_sample[[x]] > 10 & grepl(pattern = "^[2]{1}[0]{1}[0-3]{1}[0-9]{1}/[0-9]{1,2}/[0-9]{1,2}$|^[2]{1}[0]{1}[0-3]{1}[0-9]{1}/[0-9]{1}/[0-9]{1}$|^[2]{1}[0]{1}[0-3]{1}[0-9]{1}/[0-9]{1,2}/[0-3]{1}[0-9]{1}$|^[2]{1}[0]{1}[0-3]{1}[0-9]{1}/[0-9]{1}/[0-3]{1}[0-9]{1}$|^[1]{1}[9]{1}[0-9]{2}/[0-9]{1,2}/[0-3]{1}[0-9]{1}$", x = gsub(" ", "", substr(t_len[[x]], 1, 10))) & grepl(pattern = "^[0-9]{1,2}:[0-9]{2}", gsub(" ", "", substr(t_len[[x]], 11, nchar(t_len[[x]]))))) {
                df_date[[x]] = as.POSIXct(as.character(df_date[[x]]))
            }
            if (t_sample[[x]] == 10 & grepl(pattern = "^[1]{1}[0-9]{9}", x = t_len[[x]])) {
                df_date[[x]] = as.POSIXct(as.numeric(df_date[[x]]), origin = "1970-01-01")
            }
            if (t_sample[[x]] == 13 & grepl(pattern = "^[1]{1}[0-9]{12}", x = t_len[[x]])) {
                df_date[[x]] = as.POSIXct(as.numeric(df_date[[x]]) / 1000, origin = "1970-01-01")
            }
        }
        dat[df_date_cols] <- df_date
        rm(df_date)
    } else {
        dat = dat
    }
    return(dat)
}


#' Derivation of Behavioral Variables
#'
#' This function is used for derivating behavioral variables and is not intended to be used by end user.
#'
#' @param  dat  A data.frame contained only predict variables.
#' @param  grx  Regular expressions used to match variable names.
#' @param  grx_x  Regular expression used to match a group of variable names.
#' @param  td  Number of variables to derivate.
#' @param  der  Variables to derivate
#' @param  parallel Logical, parallel computing. Default is FALSE.
#' @details  The key to creating a good model is not the power of a specific modelling technique, but the breadth and depth of derived variables that represent a higher level of knowledge about the phenomena under examination. 
#' @importFrom data.table setDT :=  rbindlist
#' @export

derived_ts_vars <- function(dat, grx, td = 12,
                            der = c("cvs", "sums", "means", "maxs", "max_mins",
                                    "time_intervals", "cnt_intervals", "total_pcts",
                                    "cum_pcts", "partial_acfs"),
                            parallel = TRUE) {
    cat(paste("derived variables of", paste(der), ". \n"))
    if (parallel) {
        parallel <- start_parallel_computing(parallel)
        stopCluster <- TRUE
    } else {
        parallel <- stopCluster <- FALSE
    }
    on.exit(if (parallel & stopCluster) stop_parallel_computing(attr(parallel, "cluster")))
    i. = NULL
    if (!parallel) {
        df_cv_list <- lapply(unlist(grx), function(grx) derived_ts(dat, grx, td = td, der = der))
        df_cv_list <- as.data.frame(Reduce("cbind", df_cv_list))
    } else {
        df_cv_list <- foreach(i. = unlist(grx),
                              .combine = "c",
                              .errorhandling = c('pass')) %dopar% {
                                  try(do.call(derived_ts, args = list(dat = dat, grx_x = i., td = td,
                                                                    der = der)), silent = TRUE)
                              }
        df_cv_list <- as.data.frame(df_cv_list)
    }
    return(df_cv_list)
}

#' @rdname derived_ts_vars
#' @importFrom stringr str_extract
#' @export


derived_ts <- function(dat = dat, grx_x = NULL, td = 12,
                       der = c("cvs", "sums", "means", "maxs", "max_mins",
                               "time_intervals", "cnt_intervals", "total_pcts",
                               "cum_pcts", "partial_acfs")) {
    setDT(dat)
    cv_cols <- grep(grx_x, paste(colnames(dat)))[1:td]
    cv_cols <- cv_cols[!is.na(cv_cols)]
    #cv_folds
    if (length(cv_cols) > 0) {
        name_n = orignal_nam = sim_nam = str_num = c()
        orignal_nam <- names(dat[, cv_cols, with = FALSE])
        str_num = as.numeric(str_extract(orignal_nam, "\\d+"))
        if (!any(is.na(str_num)) && length(str_num) == td) {
            name_n = paste(min(str_num), max(str_num), sep = "to")
        }
        sim_nam = paste(unique(lapply(1:(length(orignal_nam) - 1),
        function(x) sim_str(orignal_nam[x], orignal_nam[x + 1])))[[1]], collapse = "_")
        if (any(der == "cvs")) {
            dat = dat[, paste(sim_nam, name_n, "_cvs", sep = "") := ifelse(rowAllnas(dat[, cv_cols, with = FALSE]), NA,
            rowCVs(dat[, cv_cols, with = FALSE], na.rm = TRUE))]
        }
        if (any(der == "sums")) {
            dat = dat[, paste(sim_nam, name_n, "sums", sep = "_") := ifelse(rowAllnas(dat[, cv_cols, with = FALSE]), NA,
            rowSums(dat[, cv_cols, with = FALSE], na.rm = TRUE))]
        }
        if (any(der == "means")) {
            dat = dat[, paste(sim_nam, name_n, "means", sep = "_") := ifelse(rowAllnas(dat[, cv_cols, with = FALSE]), NA,
            rowMeans(dat[, cv_cols, with = FALSE], na.rm = TRUE))]
        }
        if (any(der == "maxs")) {
            dat = dat[, paste(sim_nam, name_n, "maxs", sep = "_") := ifelse(rowAllnas(dat[, cv_cols, with = FALSE]), NA,
            rowMaxs(dat[, cv_cols, with = FALSE]))]
        }

        if (any(der == "max_mins")) {
            dat = dat[, paste(sim_nam, name_n, "max_mins", sep = "_") := ifelse(rowAllnas(dat[, cv_cols, with = FALSE]), NA,
            rowMaxMins(dat[, cv_cols, with = FALSE], na.rm = TRUE))]
        }
        if (any(der == "partial_acfs")) {
            dat = dat[, paste(sim_nam, name_n, "partial_acfs", sep = "_") := derived_partial_acf(dat[, cv_cols, with = FALSE])]
        }
        if (any(der == "time_intervals")) {
            dat = dat[, paste(orignal_nam, "time_intervals", sep = "_") := derived_interval(dat[, cv_cols, with = FALSE],
            interval_type = "time_interval")]
        }
        if (any(der == "cnt_intervals")) {
            dat = dat[, paste(orignal_nam, "cnt_intervals", sep = "_") := derived_interval(dat[, cv_cols, with = FALSE],
            interval_type = "cnt_interval")]
        }
        if (any(der == "total_pcts")) {
            dat = dat[, paste(orignal_nam, "total_pcts", sep = "_") := derived_pct(dat[, cv_cols, with = FALSE],
            pct_type = "total_pct")]
        }
        if (any(der == "cum_pcts")) {
            dat = dat[, paste(orignal_nam, "cum_pcts", sep = "_") := derived_pct(dat[, cv_cols, with = FALSE],
            pct_type = "cum_pct")]
        }
    }
    return(dat)
}



#' Processing of Time or Date Variables
#'
#' This function is not intended to be used by end user.
#'
#' @param  df_tm  A data.frame
#' @param  x  Time variable.
#' @param  enddate  End time.
#' @export
#' @importFrom data.table  hour setnames
time_vars_process <- function(df_tm = df_tm, x, enddate = "occur_time") {
    if ((class(df_tm[[x]])[1] == 'POSIXct' | class(df_tm[[x]])[1] == 'Date') & x != enddate) {
        mydata <- within(df_tm, {
            new = as.numeric(df_tm[[enddate]] - as.Date(df_tm[[x]]))
            new2 = ifelse(is.na(df_tm[[x]]) == TRUE, 0, 1)
            new3 = hour(round(as.POSIXct(df_tm[[x]])))
        })
        new_name <- c(paste(x, "_", enddate, "_duration", sep = ""),
                      paste(x, "IF", sep = "_"),
                      paste(x, "hours", sep = "_"))
        setnames(mydata, c("new", "new2", "new3"), c(new_name))
        return(mydata[, new_name])
    }
}


#' time_varieble
#'
#' This function is not intended to be used by end user.
#'
#' @param  dat  A data.frame.
#' @param  date_cols  Time variables.
#' @param  enddate  End time.
#' @export
time_varieble <- function(dat, date_cols = NULL, enddate = NULL) {

    dat = checking_data(dat = dat)
    date_cols1 = NULL
    if (!is.null(date_cols)) {
        date_cols1 <- names(dat)[colnames(dat) %islike% c(enddate, date_cols)]
    } else {
        date_cols1 = names(dat)
    }
    df_date = dat[date_cols1]
    df_date = time_transfer(dat = df_date, date_cols = c(enddate, date_cols))
    df_date <- df_date[!colAllnas(df_date)]
    df_tm = df_date[sapply(df_date, is_date)]

    time_vars_list <- lapply(date_cols1, function(x) time_vars_process(df_tm = df_tm, x, enddate = enddate))
    index <- 0;
    j <- 1
    for (i in 1:length(time_vars_list)) {
        if (is.null(time_vars_list[[i]])) {
            index[j] <- i
            j <- j + 1
        }
    }
    tm_vars_tbl <- as.data.frame(Reduce("cbind", time_vars_list[-index]))

    return(tm_vars_tbl)
}


#' Processing of Address Variables
#'
#' This function is not intended to be used by end user.
#'
#' @param df_city A data.frame.
#' @param x Variables of city,
#' @param city_class  Class or levels of cities.
#' @export
city_varieble_process <- function(df_city, x, city_class) {
    if (class(df_city)[1] != "data.frame") {
        df_city <- as.data.frame(df_city)
    }
    df_city <- within(df_city, {
        city_level <- NA
        city_level[df_city[[x]] %alike% city_class[1]] <- 1
        city_level[df_city[[x]] %alike% city_class[2]] <- 2
        city_level[df_city[[x]] %alike% city_class[3]] <- 3
        city_level[df_city[[x]] %alike% city_class[4]] <- 4
        city_level[df_city[[x]] %alike% city_class[5]] <- 5
        city_level[df_city[[x]] %alike% city_class[6]] <- 6
        city_level[is.null(df_city[[x]]) == TRUE | df_city[[x]] == "NULL" | df_city[[x]] == "" |
            df_city[[x]] == "Missing" | city_level == "NA" | df_city[[x]] == "NA"] <- -1
        city_level[is.na(city_level)] <- -1
    })
    NAsRate <- length(which(df_city$city_level == -1)) / nrow(df_city)
    if (NAsRate >= 0.3 & NAsRate < 0.6) {
        df_city2 <- data.frame()
        df_city2 <- within(df_city, {
            city_level <- NA
            city_level[df_city[[x]] %alike% city_class[1]] <- 1
            city_level[df_city[[x]] %alike% city_class[2]] <- 2
            city_level[df_city[[x]] %alike% city_class[3]] <- 3
            city_level[df_city[[x]] %alike% city_class[4]] <- 4
            city_level[df_city[[x]] %alike% city_class[5]] <- 4
            city_level[df_city[[x]] %alike% city_class[6]] <- 4
            city_level[is.null(df_city[[x]]) == TRUE | df_city[[x]] == "NULL" | df_city[[x]] == "" |
                df_city[[x]] == "Missing" | city_level == "NA" | df_city[[x]] == "NA"] <- -1
            city_level[is.na(city_level)] <- -1
        })
    }
    if (NAsRate >= 0.6) {
        df_city3 <- data.frame()
        df_city3 <- within(df_city, {
            city_level <- NULL
            city_level[df_city[[x]] %alike% city_class[1]] <- 1
            city_level[df_city[[x]] %alike% city_class[2]] <- 1
            city_level[df_city[[x]] %alike% city_class[3]] <- 1
            city_level[df_city[[x]] %alike% city_class[4]] <- 1
            city_level[df_city[[x]] %alike% city_class[5]] <- 1
            city_level[df_city[[x]] %alike% city_class[6]] <- 1
            city_level[is.null(df_city[[x]]) == TRUE | df_city[[x]] == "NULL" | df_city[[x]] == "" |
                df_city[[x]] == "Missing" | city_level == "NA" | df_city[[x]] == "NA"] <- -1
            city_level[is.na(city_level)] <- -1
        })
    }
    city_level_name <- paste(x, "city_level", sep = "_")
    df_city <- re_name(df_city, city_level, city_level_name)
    return(df_city[city_level_name])
}

#' city_varieble
#'
#' This function is used for city variables derivation.
#'
#' @param df  A data.frame.
#' @param city_cols Variables of city,
#' @param city_pattern  Regular expressions, used to match city variable names. Default is "city$".
#' @param city_class  Class or levels of cities.
#' @param parallel Logical, parallel computing. Default is TRUE.
#' @importFrom dplyr group_by mutate summarize  summarise n  count %>% filter left_join
#' @importFrom parallel detectCores  clusterExport clusterCall makeCluster stopCluster
#' @importFrom doParallel registerDoParallel
#' @importFrom foreach foreach %dopar% %do%  registerDoSEQ
#' @export
city_varieble <- function(df = df, city_cols = NULL,
                          city_pattern = "city$", city_class = city_class,
                          parallel = TRUE) {
    if (class(df)[1] != "data.frame") {
        df <- as.data.frame(df)
    }
    if (is.null(city_cols)) {
        city_index <- grepl(city_pattern, paste(colnames(df)))
        city_cols <- names(df[city_index])
    } else {
        city_cols <- names(df[city_cols])
    }
    df_city = df[, city_cols]
    if (parallel) {
        parallel <- start_parallel_computing(parallel)
        stopCluster <- TRUE
    } else {
        parallel <- stopCluster <- FALSE
    }
    on.exit(if (parallel & stopCluster) stop_parallel_computing(attr(parallel, "cluster")))
    i. = NULL
    df_city_list = list()
    if (!parallel) {
        df_city_list <- lapply(city_cols, function(x) city_varieble_process(df_city, x, city_class))
        df_city_tbl <- Reduce("cbind", df_city_list) %>% as.data.frame()
    } else {
        df_city_list <- foreach(i. = city_cols, .combine = "c") %dopar% {
            try(do.call(city_varieble_process,
                        args = list(df_city = df_city, x = i., city_class = city_class)),
                silent = TRUE)
        }
        df_city_tbl <- as.data.frame(df_city_list)
    }
    return(df_city_tbl)
}

#' add_variable_process
#'
#' This function is not intended to be used by end user.
#'
#' @param  add  A data.frame contained address variables.
#' @export
add_variable_process <- function(add) {
    # acquire a sets of addresses
    add1 = as.data.frame(add)
    sim1 = colname1 = list()
    for (i in 1:ncol(add1)) {
        if (i >= ncol(add1)) break
        sim1[[i]] <- apply(add1[, i:ncol(add1)], 2,
                       function(x) {
                           ifelse(add1[, i] %alike% x, 1, 0)
                       })
        colname1[[i]] = lapply(names(add1)[i:(ncol(add1))], function(n) paste(names(add1)[i], n, sep = '_WITH_'))
    }
    sim1 = data.frame(t(unlist(sim1)), stringsAsFactors = FALSE)
    names(sim1) = unlist(colname1)
    # find the variables which are computing similarity with themselves
    splitvar <- strsplit(names(sim1), "_WITH_")
    vars <- c()
    for (i in 1:(length(sim1))) {
        if (splitvar[[i]][1] == splitvar[[i]][2]) {
            vars[[i]] <- names(sim1)[i]
        } else {
            vars[[i]] <- NA
        }
    }
    # get the final results
    sim = sim1[is.na(vars)]
    simm <- as.vector(sim)
    return(simm)
}


#' address_varieble
#'
#' This function is not intended to be used by end user.
#'
#' @param df  A data.frame.
#' @param address_cols Variables of address,
#' @param address_pattern  Regular expressions, used to match address variable names.
#' @param parallel Logical, parallel computing. Default is TRUE.
#' @importFrom dplyr group_by mutate summarize  summarise n  count %>% filter left_join
#' @importFrom parallel detectCores  clusterExport clusterCall makeCluster stopCluster
#' @importFrom doParallel registerDoParallel
#' @importFrom foreach foreach %dopar% %do%  registerDoSEQ
#' @export
address_varieble <- function(df, address_cols = NULL, address_pattern = NULL, parallel = TRUE) {
    if (class(df)[1] != "data.frame") {
        df <- as.data.frame(df)
    }
    if (is.null(address_cols)) {
        address_cols <- grepl(address_pattern, paste(colnames(df)))
        address_vars = names(df)[address_cols]
    } else {
        address_vars <- names(df[address_cols])
    }
    df_add = df[address_vars]
    if (parallel) {
        parallel <- start_parallel_computing(parallel)
        stopCluster <- TRUE
    } else {
        parallel <- stopCluster <- FALSE
    }
    on.exit(if (parallel & stopCluster) stop_parallel_computing(attr(parallel, "cluster")))
    i. = NULL
    df_add_list = list()
    if (!parallel) {
        df_add_list <- lapply(1:nrow(df_add), function(i.) add_variable_process(add = df_add[i.,]))
        df_add_tbl <- Reduce("cbind", df_add_list) %>% as.data.frame()
    } else {
        df_add_list <- foreach(i. = 1:nrow(df_add), .combine = "c") %dopar% {
            try(do.call(add_variable_process, args = list(add = df_add[i.,])), silent = TRUE)
        }
        df_add_tbl <- as.data.frame(df_add_list)
    }
    return(df_add_tbl)
}



#' variable_process
#'
#' This function is not intended to be used by end user.
#'
#' @param  add  A data.frame
#' @importFrom data.table :=
#' @export
variable_process <- function(add) {
    td = new3 = new2 = grx_x = colname1 = NULL

    # acquire a sets of addresses
    cv_cols <- grep(grx_x, paste(colnames(dat)))[1:td]
    cv_cols <- cv_cols[!is.na(cv_cols)]
    #cv_folds
    if (length(cv_cols) > 0) {
        dat = dat[, new2 := ifelse(rowAllnas(dat[, cv_cols, with = FALSE]), NA,
        rowSums(dat[, cv_cols, with = FALSE], na.rm = TRUE))]
        dat = dat[, new3 := ifelse(rowAllnas(dat[, cv_cols, with = FALSE]), NA,
        rowMeans(dat[, cv_cols, with = FALSE], na.rm = TRUE))]
    }
    sim1 = data.frame(t(unlist(sim1)), stringsAsFactors = FALSE)
    names(sim1) = unlist(colname1)
    # find the variables which are computing similarity with themselves
    splitvar <- strsplit(names(sim1), "_WITH_")
    vars <- c()
    for (i in 1:(length(sim1))) {
        if (splitvar[[i]][1] == splitvar[[i]][2]) {
            vars[[i]] <- names(sim1)[i]
        } else {
            vars[[i]] <- NA
        }
    }
    # get the final results
    sim = sim1[is.na(vars)]
    simm <- as.vector(sim)
    return(simm)
}


#' derived_interval
#'
#' This function is not intended to be used by end user. 
#'
#' @param  dat_s  A data.frame contained only predict variables.
#' @param interval_type  Available of c("cnt_interval", "time_interval")
#' @export
#' @importFrom data.table first
derived_interval <- function(dat_s, interval_type = c("cnt_interval", "time_interval")) {
    interval_list <- apply(dat_s, 1, function(m) {
        if (interval_type == "time_interval") {
            cnt_ind = inter_ind = which(!is.na(m) | m != 0)
        } else {
            cnt_ind = which(m >= 0)
            inter_ind = unlist(m, use.names = FALSE)[c(cnt_ind)]
        }
        interval <- rep(NA, length(m))
        if (length(cnt_ind) > 1) {
            interval[cnt_ind] <- vapply(1:(length(inter_ind)), function(i) {

                ifelse(i <= length(inter_ind), abs(inter_ind[i] - inter_ind[i + 1]), NA)

            }, FUN.VALUE = numeric(1))
        }
        interval = c(abs(1 - data.table::first(inter_ind)), interval[-length(interval)])
        interval
    })
    interval_list = as.data.frame(t(interval_list))
    interval_list
}


#' derived_pct
#'
#' This function is not intended to be used by end user. 
#'
#' @param  dat_s  A data.frame contained only predict variables.
#' @param pct_type  Available of "total_pct"
#' @export
derived_pct <- function(dat_s, pct_type = "total_pct") {
    dat_s[is.na(dat_s)] <- 0
    if (pct_type == "total_pct") {
        pct_list = dat_s / rowSums(dat_s, na.rm = TRUE)
    } else {
        cnt_pct_list = dat_s / rowSums(dat_s, na.rm = TRUE)
        pct_list = apply(cnt_pct_list, 1, function(x) cumsum(x))
        pct_list = as.data.frame(t(pct_list))
    }

    pct_list
}


#' derived_partial_acf
#'
#' This function is not intended to be used by end user. 
#'
#' @param  dat_s  A data.frame
#' @export

derived_partial_acf <- function(dat_s) {
    dat_s[is.na(dat_s)] <- 0
    p_acf <- apply(dat_s, 1, function(x) ifelse(length(unique(x)) > 2, mean(abs(ar(ts(x), FALSE,
    length(unique(x)) - 1, na.action = na.pass)$partialacf)), NA))
    p_acf
}

#' sim_str
#'
#' This function is not intended to be used by end user. 
#'
#' @param a A string
#' @param b  A string
#' @param sep Seprater of strings. Default is "_|[.]|[A-Z]".
#' @export
sim_str <- function(a, b, sep = "_|[.]|[A-Z]") {
    intersect(strsplit(a, sep)[[1]], strsplit(b, sep)[[1]])
}


#' Recovery Percent Format
#'
#' \code{de_percent} is  a small function for recoverying percent format..
#' @param x  Character with percent formant.
#' @param digits  Number of digits.Default: 2.
#' @return  x without percent format.
#' @examples
#' de_percent("24%")
#' @export

de_percent <- function(x, digits = 2) {
    x = as.character(x)
    round(as.numeric(gsub("%", "", x)) / 100, digits = digits)
}


#' Merge Category
#'
#' \code{merge_category} is  for merging   category of nominal variables which number of categories is more than m or percent of samples in any categories is less than p.
#' @param dat A data frame with x and target.
#' @param ex_cols A list of excluded variables. Default is NULL.
#' @param p The minimum percent of samples in a category to merge.
#' @param m The minimum number of categories.
#' @param note Logical, outputs info. Default is TRUE.
#' @return  A data.frame with merged category variables.
#' @examples
#' #merge_catagory
#' dat =  merge_category(lendingclub,ex_cols = "id$|_d$")
#' char_list = get_names(dat = dat,types = c('factor', 'character'),
#' ex_cols = "id$|_d$", get_ex = FALSE)
#' str(dat[,char_list])
#' @export

merge_category <- function(dat, ex_cols = "date$|id$|time$|DATA$|ID$|TIME$", p = 0.01, m = 10, note = FALSE) {
    opt = options("warn" = -1) # suppress warnings
    if (note) {
        (cat("[NOTE] merge categories which percent is less than 0.001 or  obs number is less than 10.\n"))
    }
    char_list = get_names(dat = dat,
                          types = c('factor', 'character'),
                          ex_cols = ex_cols, get_ex = FALSE)

    for (x in char_list) {
        dt_x = table(as.character(dat[, x]), useNA = "no")
        merge_cat = which(dt_x < nrow(dat) * p)
        over_vars = order(abs(dt_x), decreasing = TRUE)[m:length(dt_x)]
        char_num = tryCatch({ as.numeric(names(dt_x)) },
                            error = function(e) { cat("ERROR :", conditionMessage(e), "\n") },
                            warning = function(w) { as.numeric(names(dt_x)) })
        char_num_ind = which(!is.na(char_num))
        if ((length(merge_cat) > 0 & length(over_vars) > 0) &&
            round(length(char_num_ind) / length(dt_x), 2) < 0.8) {
            max_class = unique(c(over_vars, merge_cat))
            dat[which(dat[, x] %in% names(dt_x[max_class])), x] = "other"
        }
    }
    options(opt) # reset warnings
    return(dat)
}

#' character to number
#'
#' \code{char_to_num} is  for transfering character variables which are actually numerical numbers containing strings  to numeric.
#' @param dat A data frame
#' @param ex_cols A list of excluded variables. Regular expressions can also be used to match variable names. Default is NULL.
#' @param note Logical, outputs info. Default is TRUE.
#' @return  A data.frame
#' @examples
#' dat_sub = lendingclub[c("mths_since_recent_revol_delinq", "mths_since_last_record")]
#' str(dat_sub)
#' #variables that are converted to numbers containing strings
#' dat_sub[is.na(dat_sub)] = "Missing"
#' str(dat_sub)
#' dat_sub = char_to_num(dat_sub)
#' str(dat_sub)
#' @export

char_to_num <- function(dat, note = FALSE, ex_cols = "date$|id$|time$|DATA$|ID$|TIME$") {
    opt = options("warn" = -1) # suppress warnings
    if (note) {
        cat("[NOTE] transfer character variables which are actually numerical to numeric.\n")
    }
    char_list = get_names(dat = dat,
                          types = c('factor', 'character'),
                          ex_cols = ex_cols, get_ex = FALSE)

    for (x in char_list) {
        dt_x = table(as.character(dat[, x]), useNA = "no")
        char_num = tryCatch({ as.numeric(names(dt_x)) },
                            error = function(e) {
                                cat("ERROR :", conditionMessage(e), "\n")
                            },
                            warning = function(w) {
                                as.numeric(names(dt_x))
                            })
        char_num_ind = which(!is.na(char_num))
        if (length(char_num_ind) > 0 && length(dt_x) > 1 && round(length(char_num_ind) / length(dt_x), 2) >= 0.8) {
            dat[, x] = as.numeric(as.character(dat[, x]))
        }
    }
    options(opt) # reset warnings
    return(dat)
}

