% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/tokenize.R
\name{tokenize}
\alias{tokenize}
\title{Tokenize sentences using 'MeCab'}
\usage{
tokenize(
  x,
  text_field = "text",
  docid_field = "doc_id",
  sys_dic = "",
  user_dic = "",
  split = FALSE,
  partial = FALSE,
  grain_size = 1L,
  mode = c("parse", "wakati")
)
}
\arguments{
\item{x}{A data.frame-like object or a character vector to be tokenized.}

\item{text_field}{<\code{\link[rlang:args_data_masking]{data-masked}}>
String or symbol; column name where to get texts to be tokenized.}

\item{docid_field}{<\code{\link[rlang:args_data_masking]{data-masked}}>
String or symbol; column name where to get identifiers of texts.}

\item{sys_dic}{Character scalar; path to the system dictionary for 'MeCab'.
Note that the system dictionary is expected to be compiled with UTF-8,
not Shift-JIS or other encodings.}

\item{user_dic}{Character scalar; path to the user dictionary for 'MeCab'.}

\item{split}{Logical. When passed as \code{TRUE}, the function internally splits the sentences
into sub-sentences using \code{stringi::stri_split_boundaries(type = "sentence")}.}

\item{partial}{Logical. When passed as \code{TRUE}, activates partial parsing mode.
When using this feature, make sure that all spaces at the start and end of
the input chunks have already been squashed. In particular, trailing spaces
in chunks sometimes cause fatal errors.}

\item{grain_size}{Integer value greater than or equal to 1.
This argument is internally passed to the \code{RcppParallel::parallelFor} function.
Setting a larger chunk size could improve the performance in some cases.}

\item{mode}{Character scalar to switch output format;
either \code{"parse"} or \code{"wakati"}.}
}
\value{
A tibble or a named list of tokens.
}
\description{
Tokenize sentences using 'MeCab'
}
\examples{
\dontrun{
df <- tokenize(
  data.frame(
    doc_id = seq_along(ginga[5:8]),
    text = ginga[5:8]
  )
)
head(df)
}
}
