% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/utils.R
\name{lma_patcat}
\alias{lma_patcat}
\title{Categorize Texts}
\usage{
lma_patcat(text, dict = NULL, pattern.weights = "weight",
  pattern.categories = "category", bias = NULL, to.lower = TRUE,
  return.dtm = FALSE, drop.zeros = FALSE, exclusive = TRUE,
  boundary = NULL, fixed = TRUE, globtoregex = FALSE,
  name.map = c(intname = "_intercept", term = "term"),
  dir = getOption("lingmatch.dict.dir"))
}
\arguments{
\item{text}{A vector of text to be categorized. Texts are padded by 2 spaces, and potentially lowercased.}

\item{dict}{At least a vector of terms (patterns), usually a matrix-like object with columns for terms,
categories, and weights.}

\item{pattern.weights}{A vector of weights corresponding to terms in \code{dict}, or the column name of
weights found in \code{dict}.}

\item{pattern.categories}{A vector of category names corresponding to terms in \code{dict}, or the column name of
category names found in \code{dict}.}

\item{bias}{A constant to add to each category after weighting and summing. Can be a vector with names
corresponding to the unique values in \code{dict[, category]}, but is usually extracted from \code{dict} based
on the intercept included in each category (defined by \code{name.map['intname']}).}

\item{to.lower}{Logical indicating whether \code{text} should be converted to lowercase before processing.}

\item{return.dtm}{Logical; if \code{TRUE}, only a document-term matrix will be returned, rather than the
weighted, summed, and biased category values.}

\item{drop.zeros}{Logical; if \code{TRUE}, categories or terms with no matches will be removed.}

\item{exclusive}{Logical; if \code{FALSE}, each dictionary term is searched for in the original text.
Otherwise (by default), terms are sorted by length (with longer terms being searched for first), and
matches are removed from the text (avoiding subsequent matches to matched patterns).}

\item{boundary}{A string to add to the beginning and end of each dictionary term. If \code{TRUE},
\code{boundary} will be set to \code{' '}, avoiding pattern matches within words. By default, dictionary
terms are left as entered.}

\item{fixed}{Logical; if \code{FALSE}, patterns are treated as regular expressions.}

\item{globtoregex}{Logical; if \code{TRUE}, initial and terminal asterisks are replaced with \code{\\\\b\\\\w*}
and \code{\\\\w*\\\\b} respectively. This will also set \code{fixed} to \code{FALSE} unless \code{fixed} is specified.}

\item{name.map}{A named character vector:
\itemize{
  \item \strong{\code{intname}}: term identifying category biases within the term list;
    defaults to \code{'_intercept'}
  \item \strong{\code{term}}: name of the column containing terms in \code{dict}; defaults to \code{'term'}
}
Missing names are added, so names can be specified positionally (e.g., \code{c('_int',} \code{'terms')}),
or only some can be specified by name (e.g., \code{c(term =} \code{'patterns')}), leaving the rest default.}

\item{dir}{Path to a folder in which to look for \code{dict} if it is the name of a file to be passed to
\code{\link{read.dic}}.}
}
\value{
A matrix with a row per \code{text} and columns per dictionary category, or (when \code{return.dtm = TRUE})
a sparse matrix with a row per \code{text} and column per term. Includes a \code{WC} attribute with original
word counts, and a \code{categories} attribute with row indices associated with each category if
\code{return.dtm = TRUE}.
}
\description{
Categorize raw texts using a pattern-based dictionary.
}
\examples{
# example text
text = c(
  paste(
    "Oh, what youth was! What I had and gave away.",
    "What I took and spent and saw. What I lost. And now? Ruin."
  ),
  paste(
    "God, are you so bored?! You just want what's gone from us all?",
    "I miss the you that was too. I love that you."
  ),
  paste(
    "Tomorrow! Tomorrow--nay, even tonight--you wait, as I am about to change.",
    "Soon I will off to revert. Please wait."
  )
)

# make a document-term matrix with pre-specified terms only
lma_patcat(text, c('bored?!', 'i lo', '. '), return.dtm = TRUE)

# get counts of sets of letters
lma_patcat(text, list(c('a', 'b', 'c'), c('d', 'e', 'f')))

# same thing with regular expressions
lma_patcat(text, list('[abc]', '[def]'), fixed = FALSE)

# match only words
lma_patcat(text, list('i'), boundary = TRUE)

# match only words, ignoring punctuation
lma_patcat(
  text, c('you', 'tomorrow', 'was'), fixed = FALSE,
  boundary = '\\\\b', return.dtm = TRUE
)

\dontrun{

# read in the temporal orientation lexicon from the World Well-Being Project
tempori = read.csv('https://wwbp.org/downloads/public_data/temporalOrientationLexicon.csv')

lma_patcat(text, tempori)

# or use the standardized version
tempori_std = read.dic('wwbp_prospection', dir = '~/Dictionaries')

lma_patcat(text, tempori_std)

## get scores on the same scale by adjusting the standardized values
tempori_std[, -1] = tempori_std[, -1] / 100 *
  select.dict('wwbp_prospection')$selected[, 'original_max']

lma_patcat(text, tempori_std)[, unique(tempori$category)]
}
}
\seealso{
For applying term-based dictionaries (to a document-term matrix) see \code{\link{lma_termcat}}.

Other Dictionary functions: 
\code{\link{download.dict}()},
\code{\link{lma_termcat}()},
\code{\link{read.dic}()},
\code{\link{select.dict}()}
}
\concept{Dictionary functions}
