% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/preprocessing.r
\name{preprocess_tokens}
\alias{preprocess_tokens}
\title{Preprocess tokens in a character vector}
\usage{
preprocess_tokens(
  x,
  context = NULL,
  language = "english",
  use_stemming = F,
  lowercase = T,
  ngrams = 1,
  replace_whitespace = F,
  as_ascii = F,
  remove_punctuation = T,
  remove_stopwords = F,
  remove_numbers = F,
  min_freq = NULL,
  min_docfreq = NULL,
  max_freq = NULL,
  max_docfreq = NULL,
  min_char = NULL,
  max_char = NULL,
  ngram_skip_empty = T
)
}
\arguments{
\item{x}{A character or factor vector in which each element is a token (i.e. a tokenized text)}

\item{context}{Optionally, a character vector of the same length as x, specifying the context of each token (e.g., document, sentence). Has to be given if ngrams > 1}

\item{language}{The language used for stemming and removing stopwords}

\item{use_stemming}{Logical, use stemming. (Make sure to specify the right language!)}

\item{lowercase}{Logical, make tokens lowercase}

\item{ngrams}{A number, specifying the number of tokens per ngram. Default is unigrams (1).}

\item{replace_whitespace}{Logical. If TRUE, all whitespace is replaced by underscores}

\item{as_ascii}{Logical. If TRUE, tokens will be forced to ascii}

\item{remove_punctuation}{Logical. If TRUE, punctuation is removed}

\item{remove_stopwords}{Logical. If TRUE, stopwords are removed (Make sure to specify the right language!)}

\item{remove_numbers}{Logical. If TRUE, tokens that consist only of numbers are removed}

\item{min_freq}{an integer, specifying minimum token frequency.}

\item{min_docfreq}{an integer, specifying minimum document frequency.}

\item{max_freq}{an integer, specifying maximum token frequency.}

\item{max_docfreq}{an integer, specifying maximum document frequency.}

\item{min_char}{an integer, specifying minimum number of characters in a term}

\item{max_char}{an integer, specifying maximum number of characters in a term}

\item{ngram_skip_empty}{Logical. If ngrams are used, determines whether empty (filtered out) terms are skipped (e.g., c("this", NA, "test") becomes "this_test") or not.}
}
\value{
a factor vector
}
\description{
Preprocess tokens in a character vector
}
\examples{
tokens = c('I', 'am', 'a', 'SHORT', 'example', 'sentence', '!')

## default is lowercase without punctuation
preprocess_tokens(tokens)

## optionally, delete stopwords, perform stemming, and make ngrams
preprocess_tokens(tokens, remove_stopwords = TRUE, use_stemming = TRUE)
preprocess_tokens(tokens, context = NA, ngrams = 3)
}
