% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/decode.R
\name{decode}
\alias{decode}
\alias{decode,corpus-method}
\alias{decode,character-method}
\alias{decode,slice-method}
\alias{decode,partition-method}
\alias{decode,subcorpus-method}
\alias{decode,integer-method}
\title{Decode corpus or subcorpus.}
\usage{
decode(.Object, ...)

\S4method{decode}{corpus}(
  .Object,
  to = c("data.table", "Annotation"),
  p_attributes = NULL,
  s_attributes = NULL,
  decode = TRUE,
  verbose = TRUE
)

\S4method{decode}{character}(
  .Object,
  to = c("data.table", "Annotation"),
  s_attributes = NULL,
  p_attributes = NULL,
  decode = TRUE,
  verbose = TRUE,
  ...
)

\S4method{decode}{slice}(
  .Object,
  to = "data.table",
  s_attributes = NULL,
  p_attributes = NULL,
  decode = TRUE,
  verbose = TRUE
)

\S4method{decode}{partition}(
  .Object,
  to = "data.table",
  s_attributes = NULL,
  p_attributes = NULL,
  decode = TRUE,
  verbose = TRUE
)

\S4method{decode}{subcorpus}(
  .Object,
  to = "data.table",
  s_attributes = NULL,
  p_attributes = NULL,
  decode = TRUE,
  verbose = TRUE
)

\S4method{decode}{integer}(.Object, corpus, p_attributes, boost = NULL)
}
\arguments{
\item{.Object}{The \code{corpus} or \code{subcorpus} to decode.}

\item{...}{Further arguments.}

\item{to}{The class of the returned object, stated as a length-one
\code{character} vector.}

\item{p_attributes}{The positional attributes to decode. If \code{NULL}
(default), all positional attributes will be decoded.}

\item{s_attributes}{The structural attributes to decode. If \code{NULL}
(default), all structural attributes will be decoded.}

\item{decode}{A \code{logical} value, whether to decode token ids and struc
ids to character strings. If \code{FALSE}, the values of columns for p- and
s-attributes will be \code{integer} vectors. If \code{TRUE} (default), the
respective columns are \code{character} vectors.}

\item{verbose}{A \code{logical} value, whether to output progress messages.}

\item{corpus}{A CWB indexed corpus, either a length-one \code{character} vector,
or a \code{corpus} object.}

\item{boost}{A length-one \code{logical} value, whether to speed up decoding
a long vector of token ids by directly reading in the lexicon file from
the data directory of a corpus. If \code{NULL} (default), the internal
decision rule is that \code{boost} will be \code{TRUE} if the corpus is
larger than 10 000 000 tokens and more than 5 percent of the corpus
is to be decoded.}
}
\value{
The return value will correspond to the class specified by argument
  \code{to}.
}
\description{
Decode \code{corpus} or \code{subcorpus} and return class specified by
argument \code{to}.
}
\details{
The primary purpose of the method is type conversion. By obtaining the corpus
or subcorpus in the format specified by the argument \code{to}, the data can
be processed with tools that do not rely on the Corpus Workbench (CWB).
Supported output formats are \code{data.table} (which can be converted to a
\code{data.frame} or \code{tibble} easily) or an \code{Annotation} object as
defined in the package \code{NLP}. Another purpose of decoding the corpus can
be to rework it, and to re-import it into the CWB (e.g. using the
\code{cwbtools}-package).

An earlier version of the method included an option to decode a single
s-attribute, which is not supported any more. See the
\code{s_attribute_decode} function of the package RcppCWB.

If \code{.Object} is an \code{integer} vector, it is assumed to be a
  vector of integer ids of p-attributes. The \code{decode}-method will
  translate token ids to string values as efficiently as possible. The
  approach taken will depend on the corpus size and the share of the corpus
  that is to be decoded. To decode a large number of integer ids, it is more
  efficient to read the lexicon file from the data directory directly and to
  index the lexicon with the ids rather than relying on
  \code{RcppCWB::cl_id2str}. The internal decision rule is to use the lexicon
  file when the corpus is larger than 10 000 000 tokens and more than
  5 percent of the corpus is to be decoded. The encoding of the
  \code{character} vector that is returned will be the encoding of the locale
  (usually ISO-8859-1 on Windows, and UTF-8 on macOS and Linux machines).
}
\examples{
use("polmineR")

# Decode corpus as data.table
dt <- decode("GERMAPARLMINI", to = "data.table")

# Decode corpus selectively
dt <- decode("GERMAPARLMINI", to = "data.table", p_attributes = "word", s_attributes = "party")

# Decode a subcorpus
sc <- subset(corpus("GERMAPARLMINI"), speaker == "Angela Dorothea Merkel")
dt <- decode(sc, to = "data.table")

# Decode subcorpus selectively
dt <- decode(sc, to = "data.table", p_attributes = "word", s_attributes = "party")

# Decode partition
P <- partition("REUTERS", places = "kuwait", regex = TRUE)
dt <- decode(P)

# Previous versions of polmineR offered an option to decode a single
# s-attribute. This is how you could proceed to get a table with metadata.
dt <- decode(P, s_attribute = "id", decode = FALSE)
dt[, "word" := NULL]
dt[,{list(cpos_left = min(.SD[["cpos"]]), cpos_right = max(.SD[["cpos"]]))}, by = "id"]

# Decode subcorpus as Annotation object
\dontrun{
if (requireNamespace("NLP")){
  library(NLP)
  p <- subset(corpus("GERMAPARLMINI"), date == "2009-11-10" & speaker == "Angela Dorothea Merkel")
  s <- as(p, "String")
  a <- as(p, "Annotation")
  
  # The beauty of having this NLP Annotation object is that you can now use 
  # the different annotators of the openNLP package. Here, just a short scenario
  # how you can have a look at the tokenized words and the sentences.

  words <- s[a[a$type == "word"]]
  sentences <- s[a[a$type == "sentence"]] # does not yet work perfectly for plenary protocols 
}
}
 
# decode vector of token ids
y <- decode(0:20, corpus = "GERMAPARLMINI", p_attributes = "word")
}
\seealso{
To decode a structural attribute, you can use the
  \code{\link{s_attributes}}-method, setting argument \code{unique} to
  \code{FALSE}, and \code{\link[RcppCWB]{s_attribute_decode}}. See
  \code{\link{as.VCorpus}} to decode a \code{partition_bundle} object,
  returning a \code{VCorpus} object.
}
