% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/DistToNearest.R
\name{distToNearest}
\alias{distToNearest}
\title{Distance to nearest neighbor}
\usage{
distToNearest(db, sequenceColumn = "JUNCTION", vCallColumn = "V_CALL",
  jCallColumn = "J_CALL", model = c("hs1f", "m1n", "ham", "aa", "hs5f"),
  normalize = c("length", "none"), symmetry = c("avg", "min"),
  first = TRUE, nproc = 1, fields = NULL, cross = NULL, mst = FALSE)
}
\arguments{
\item{db}{data.frame containing sequence data.}

\item{sequenceColumn}{name of the column containing nucleotide sequences to compare. 
Also used to determine sequence length for grouping.}

\item{vCallColumn}{name of the column containing the V-segment allele calls.}

\item{jCallColumn}{name of the column containing the J-segment allele calls.}

\item{model}{underlying SHM model, which must be one of 
\code{c("hs1f", "m1n", "ham", "aa", "hs5f")}.
See Details for further information.}

\item{normalize}{method of normalization. The default is \code{"length"}, which 
divides the distance by the length of the sequence group. If 
\code{"none"} then no normalization is performed.}

\item{symmetry}{if \code{model} is \code{"hs5f"}, the distance between seq1 and seq2 
is either the average (\code{"avg"}) of seq1->seq2 and seq2->seq1 or the 
minimum (\code{"min"}) of the two.}

\item{first}{if \code{TRUE} only the first call of the gene assignments 
is used. If \code{FALSE} the union of ambiguous gene 
assignments is used to group all sequences with any 
overlapping gene calls.}

\item{nproc}{number of cores to distribute the function over.}

\item{fields}{additional fields to use for grouping.}

\item{cross}{columns for grouping to calculate distances across groups 
(self vs others).}

\item{mst}{if \code{TRUE}, return comma-separated branch lengths from minimum 
spanning tree.}
}
\value{
Returns a modified \code{db} data.frame with nearest neighbor distances in the 
          \code{DIST_NEAREST} column if \code{cross=NULL} or in the 
          \code{CROSS_DIST_NEAREST} column if \code{cross} was specified.
}
\description{
Get the distance of every sequence to its nearest sequence sharing the same V gene, 
J gene, and sequence length.
}
\details{
The distance to nearest neighbor can be used to estimate a threshold for assigning Ig
sequences to clonal groups. A histogram of the resulting vector is often bimodal, 
with the ideal threshold being a value that separates the two modes.

"hs5f" uses distances derived from the \link{HS5FModel}
using \link{calcTargetingDistance}. "hs1f" and "m1n" use \link{HS1FDistance} and 
\link{M1NDistance} to calculate distances, respectively. "ham" uses a nucleotide 
Hamming distance matrix from \link[alakazam]{getDNAMatrix}, with gaps being zero. 
"aa" uses an amino acid Hamming distance matrix from \link[alakazam]{getAAMatrix}.
}
\examples{
# Subset example data to one sample as a demo
data(ExampleDb, package="alakazam")
db <- subset(ExampleDb, SAMPLE == "-1h")

# Use genotyped V assignments, HS1F model, and normalize by junction length
dist_hs1f <- distToNearest(db, vCallColumn="V_CALL_GENOTYPED", 
                           model="hs1f", first=FALSE, normalize="length")
                           
# Plot histogram of non-NA distances
p1 <- ggplot(data=subset(dist_hs1f, !is.na(DIST_NEAREST))) + theme_bw() + 
    ggtitle("Distance to nearest: hs1f") + xlab("distance") +
    geom_histogram(aes(x=DIST_NEAREST), binwidth=0.025, 
                   fill="steelblue", color="white")
plot(p1)

}
\references{
\enumerate{
  \item  Smith DS, et al. Di- and trinucleotide target preferences of somatic 
           mutagenesis in normal and autoreactive B cells. 
           J Immunol. 1996 156:2642-52. 
  \item  Glanville J, Kuo TC, von Budingen H-C, et al. 
           Naive antibody gene-segment frequencies are heritable and unaltered by 
           chronic lymphocyte ablation. 
           Proc Natl Acad Sci USA. 2011 108(50):20066-71.
  \item  Yaari G, et al. Models of somatic hypermutation targeting and substitution based 
           on synonymous mutations from high-throughput immunoglobulin sequencing data. 
           Front Immunol. 2013 4:358.
 }
}
\seealso{
See \link{calcTargetingDistance} for generating nucleotide distance matrices 
          from a \link{TargetingModel} object. See \link{M1NDistance}, 
          \link{HS5FModel}, \link[alakazam]{getDNAMatrix}, and \link[alakazam]{getAAMatrix}
          for individual model details.
}

