% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/setup_rank_data.R
\name{setup_rank_data}
\alias{setup_rank_data}
\title{Setup rank data}
\usage{
setup_rank_data(
  rankings = NULL,
  preferences = NULL,
  user_ids = numeric(),
  observation_frequency = NULL,
  validate_rankings = TRUE,
  na_action = c("augment", "fail", "omit"),
  cl = NULL,
  shuffle_unranked = FALSE,
  random = FALSE,
  random_limit = 8L,
  timepoint = NULL,
  n_items = NULL
)
}
\arguments{
\item{rankings}{A matrix of ranked items, of size \verb{n_assessors x n_items}.
See \code{\link[=create_ranking]{create_ranking()}} if you have an ordered set of items that need to be
converted to rankings. If \code{preferences} is provided, \code{rankings} is an
optional initial value of the rankings. If \code{rankings} has column names,
these are assumed to be the names of the items. \code{NA} values in rankings are
treated as missing data and automatically augmented; to change this
behavior, see the \code{na_action} argument below. A vector
of length \code{n_items} is silently converted to a matrix of size \verb{1 x n_items},
and names (if any) are used as column names.}

\item{preferences}{A data frame with one row per pairwise comparison, and
columns \code{assessor}, \code{top_item}, and \code{bottom_item}. Each column contains the
following:
\itemize{
\item \code{assessor} is a numeric vector containing the assessor index.

\item \code{bottom_item} is a numeric vector containing the index of the item that
was disfavored in each pairwise comparison.

\item \code{top_item} is a numeric vector containing the index of the item that was
preferred in each pairwise comparison.
}
So if we have two assessors and five items, and assessor 1 prefers item 1
to item 2 and item 1 to item 5, while assessor 2 prefers item 3 to item 5,
we have the following \code{df}:
\tabular{rrr}{
\strong{assessor} \tab \strong{bottom_item} \tab \strong{top_item}\cr
1 \tab 2 \tab 1\cr
1 \tab 5 \tab 1\cr
2 \tab 5 \tab 3\cr
}}

\item{user_ids}{Optional \code{numeric} vector of user IDs. Only used by
\code{\link[=update_mallows]{update_mallows()}}. If provided, new data can consist of updated partial
rankings from users already in the dataset, as described in Section 6 of
\insertCite{steinSequentialInferenceMallows2023;textual}{BayesMallows}.}

\item{observation_frequency}{A vector of observation frequencies (weights) to
apply to each row in \code{rankings}. This can speed up computation if a large
number of assessors share the same rank pattern. Defaults to \code{NULL}, which
means that each row of \code{rankings} is multiplied by 1. If provided,
\code{observation_frequency} must have the same number of elements as there are
rows in \code{rankings}, and \code{rankings} cannot be \code{NULL}. See
\code{\link[=compute_observation_frequency]{compute_observation_frequency()}} for a convenience function for computing
it.}

\item{validate_rankings}{Logical specifying whether the rankings provided (or
generated from \code{preferences}) should be validated. Defaults to \code{TRUE}.
Turning off this check will reduce computing time with a large number of
items or assessors.}

\item{na_action}{Character specifying how to deal with \code{NA} values in the
\code{rankings} matrix, if provided. Defaults to \code{"augment"}, which means that
missing values are automatically filled in using the Bayesian data
augmentation scheme described in
\insertCite{vitelli2018;textual}{BayesMallows}. The other options for this
argument are \code{"fail"}, which means that an error message is printed and the
algorithm stops if there are \code{NA}s in \code{rankings}, and \code{"omit"}, which simply
deletes rows with \code{NA}s in them.}

\item{cl}{Optional computing cluster, as returned from
\code{\link[parallel:makeCluster]{parallel::makeCluster()}}, used for parallelization when
generating the transitive closure based on preferences. Defaults to \code{NULL}.}

\item{shuffle_unranked}{Logical specifying whether or not to randomly permute
unranked items in the initial ranking. When \code{shuffle_unranked=TRUE} and
\code{random=FALSE}, all unranked items for each assessor are randomly permuted.
Otherwise, the first ordering returned by \code{igraph::topo_sort()} is
used. Defaults to \code{FALSE}.}

\item{random}{Logical specifying whether or not to use a random initial
ranking. Defaults to \code{FALSE}. Setting this to \code{TRUE} means that all
possible orderings consistent with the stated pairwise preferences are
generated for each assessor, and one of them is picked at random.}

\item{random_limit}{Integer specifying the maximum number of items allowed
when all possible orderings are computed, i.e., when \code{random=TRUE}.
Defaults to \code{8L}.}

\item{timepoint}{Integer vector specifying the timepoint. Defaults to \code{NULL},
which means that a vector of ones, one for each observation, is generated.
Used by \code{\link[=update_mallows]{update_mallows()}} to identify data with a given iteration of the
sequential Monte Carlo algorithm. If not \code{NULL}, must contain one integer
for each row in \code{rankings}.}

\item{n_items}{Integer specifying the number of items. Defaults to \code{NULL},
which means that the number of items is inferred from \code{rankings} or from
\code{preferences}. Setting \code{n_items} manually can be useful with pairwise
preference data in the SMC algorithm, i.e., when \code{rankings} is \code{NULL} and
\code{preferences} is non-\code{NULL}, and contains a small number of pairwise
preferences for a subset of users and items.}
}
\value{
An object of class \code{"BayesMallowsData"}, to be provided in the \code{data}
argument to \code{\link[=compute_mallows]{compute_mallows()}}.
}
\description{
Prepare rank or preference data for further analyses.
}
\note{
Setting \code{random=TRUE} means that all possible orderings of each
assessor's preferences are generated, and one of them is picked at random.
This can be useful when experiencing convergence issues, e.g., if the MCMC
algorithm does not mix properly. However, finding all possible orderings is
a combinatorial problem, which may be computationally very hard. The result
may not even be possible to fit in memory, which may cause the R session to
crash. When using this option, please try to increase the size of the
problem incrementally, by starting with smaller subsets of the complete
data. An example is given below.

It is assumed that the items are labeled starting from 1. For example, if a
single comparison of the following form is provided, it is assumed that
there is a total of 30 items (\code{n_items=30}), and the initial ranking is a
permutation of these 30 items consistent with the preference 29<30.

\tabular{rrr}{
\strong{assessor} \tab \strong{bottom_item} \tab \strong{top_item}\cr
1 \tab 29 \tab 30\cr
}

If in reality there are only two items, they should be relabeled to 1 and
2, as follows:

\tabular{rrr}{
\strong{assessor} \tab \strong{bottom_item} \tab \strong{top_item}\cr
1 \tab 1 \tab 2\cr
}
}
\references{
\insertAllCited{}
}
\seealso{
Other preprocessing: 
\code{\link{get_transitive_closure}()},
\code{\link{set_compute_options}()},
\code{\link{set_initial_values}()},
\code{\link{set_model_options}()},
\code{\link{set_priors}()},
\code{\link{set_smc_options}()}
}
\concept{preprocessing}
