\name{calc.relimp}
\alias{calc.relimp}
\alias{calc.relimp.default}
\alias{calc.relimp.formula}
\alias{calc.relimp.lm}
%- Also NEED an '\alias' for EACH other topic documented here.
\title{ Function to calculate relative importance metrics for linear models }
\description{
  calc.relimp calculates several relative importance metrics for the linear model. 
The recommended metrics are \code{lmg} (\eqn{R^2} partitioned by averaging over orders, like in Lindeman, Merenda and Gold (1980, p.119ff)) 
and \code{pmvd} (a newly proposed metric by Feldman (2005) that is provided in the non-US version of the package only). 
For completeness and comparison purposes, several other metrics are also on offer (cf. e.g. Darlington (1968)).
}
\usage{

## generic function
calc.relimp(object, ...)

## default S3 method, should be called without suffix ".default"
calc.relimp.default(object, x = NULL, ..., 
       type = "lmg", diff = FALSE, rank = TRUE, rela = FALSE, always = NULL, 
       groups = NULL, groupnames = NULL, weights=NULL, design=NULL)

## S3 method for formula object, should be called without suffix ".formula"
calc.relimp.formula(formula, data, weights, na.action, ..., subset=NULL)

## S3 method for objects of class lm
calc.relimp.lm(object, type = "lmg", groups = NULL, groupnames=NULL, always = NULL, ...)

}
\arguments{
  \item{object   }{ The class of this object determines which of the methods is used:
        There are special methods for output objects from function \code{\link{lm}} 
        (or linear model objects inheriting from class lm 
        generated by other functions like \code{\link{glm}} and \code{svyglm})
        and for formula objects.
        For all other types of object, the default method is used.

        Thus, object can be

        a formula (e.g. y\~x1+x2+x3+x2:x3) (cf. below for details)

        OR

        the output of a linear model call (inheriting from class \code{lm}, but not \code{mlm}); 
        output objects from \code{lm}, \code{glm}, \code{svyglm} or \code{aov} work 
        (if linear with identity link in case of glm's);
        there may be further functions that output objects inheriting from \code{lm} 
        which may or may not work reasonably with \code{calc.relimp}; 
        for \code{calc.relimp} to be appropriate, the underlying model must at least be linear!
        
        The restrictions on usage of interactions listed under item formula below also 
        apply to linear model objects. 

        OR 

        the covariance matrix of a response y and regressors x, 
        (e.g. obtained by cov(cbind(y,x)), if y is a column vector of response values 
        and x a corresponding matrix of regressors) 

        OR 
        
        a (raw) data matrix or data frame with the response variable in the first column


        OR

        a response vector or one-column matrix, 
        if \code{x} contains the corresponding matrix or data frame of regressors.}

  \item{formula   }{ The first object, if a formula is to be given; 
        one response only. 
        
        Interaction terms are currently limited to second-order.
        
        Note: If several interaction terms are given, calculations may be very 
        resource intensive, if these are all connected (e.g. with A:B, B:C, C:D, all A,B,C,D are connected, 
        while with A:B, C:D, D:E there are separate groups A,B and C,D,E). 
        
        Interaction terms occurring in always do not increase resource usage 
        (but are only permitted if the respective main effects also occur in always).
        
        Interactions and groups currently cannot be used simultaneously. 
        }
  \item{x   }{ a (raw) data matrix or data frame containing the regressors, 
       if \code{object} is a response vector or one-column matrix

        OR 
        
        NULL, if \code{object} is anything else }
  \item{type   }{ can be a character string, character vector or list of character strings.
       It is the collection of metrics that are to be calculated. 
       Available metrics: \code{lmg}, \code{pmvd} (non-US version only), \code{last}, \code{first}, 
       \code{betasq}, \code{pratt}, \code{genizi} and \code{car}. For brief sketches of their meaning cf. details section. }
  \item{diff   }{ logical; if TRUE, pairwise differences between the relative contributions are calculated; default FALSE }
  \item{rank   }{ logical; if TRUE, ranks of regressors in terms of relative contributions are calculated; default TRUE }
  \item{rela   }{ is a logical requesting relative importances summing to 100\% (\code{rela=TRUE}). 
         If rela is FALSE (default), some of the metrics sum to \eqn{R^2} (\code{lmg}, \code{pmvd}, \code{pratt}, \code{genizi}, \code{car}), 
         others do not have a meaningful sum (\code{last}, \code{first}, \code{betasq}).}
  \item{always }{ is a vector of column numbers or names of variables to be always in the 
       model (adjusted for). Valid numbers are 2 to (number of regressors + 1) (1 is reserved for the response), 
       valid character strings are all column names of \code{object} or \code{x} respectively 
       that refer to regressor variables. Numbers and names cannot be mixed. 

       Relative importance is only assessed for the variables not selected in \code{always}. 
       
       This option currently does not work for metrics \code{genizi} and \code{car}.}
  \item{groups }{ is a list of vectors of column numbers or names of variables to be combined into groups. 
       If only one group is needed, a vector can be given. The numbers and character strings needed are of the same form 
       as for \code{always}. 

       Relative importance is only allocated between groups of regressors, no subdivision within groups is calculated. 
       Regressors that do not occur in any group are included as singletons.
       A regressor must not occur in \code{always} and in \code{groups}. Also, groups cannot be used with a linear model or a formula 
       in case of higher order effects (interactions). Finally, \code{groups} only works with the four metrics 
       \code{lmg}, \code{pmvd}, \code{last} and \code{first}.}
  \item{groupnames }{ is a vector of names for the variable groups to be used for annotation of output.}
  \item{weights }{ is a vector of case weights for the observations in the data frame (or matrix).
       You can EITHER specify \code{weights} OR a \code{design}.
       Note that weights must not be specified for linear model objects (since these should contain their weights as 
       part of the model).}
  \item{design }{ is a design object of class \code{survey.design} (cf. package \pkg{survey}).
       You can EITHER specify a \code{design} OR \code{weights}. For \code{calc.relimp}, the design is used 
       for calculating weights only.
       Note that it is discouraged (though possible) to specify a design for a conventional linear model object 
       (since a survey-specific linear model should be used for survey data, cf. function \code{svyglm}). 
       
       Also note that care is needed when using \code{subset} together with \code{design}: 
       the \code{subset}-Option only treats the \code{data} handed directly to \code{calc.relimp},
       the \code{design} has to be equivalently treated beforehand. }
  \item{data}{ if first object is of class formula: 
       an optional matrix or data frame that the variables in formula and subset come from; 
       if it is omitted, all names must be meaningful in the environment from which calc.relimp is called}
  \item{subset}{ if first object is of class formula: 
        an optional expression indicating the subset of the observations of \code{data} that should be used in the fit. 
          This can be a logical vector, or a numeric vector indicating which observation numbers are to be included, 
          or a  character  vector of the row names to be included.  All (non-missing) observations are included by
          default. }
  \item{na.action}{ if first object is of class formula: 
        an optional function that indicates what should happen when the data contain 'NA's. 
        The default is first, any na.action attribute of data, second the setting given in the call to calc.relimp,
        third the na.action setting of options. Possible choices are "na.fail" 
        (print an error message and terminate if there are any incomplete observations), 
        "na.omit" or "na.exclude" (equivalent for package \code{relaimpo}, 
        both analyse complete cases only and print a warning; this is also what is done by the default method). }
  \item{...}{ usable for further arguments, 
        particularly most arguments of default method can be given to all other methods 
        (exception: weights and design cannot be given to lm-method) }
}
\details{
\describe{
  \item{lmg}{ is the \eqn{R^2} contribution averaged over orderings among regressors, cf. e.g. Lindeman, Merenda and Gold 1980, 
         p.119ff or Chevan and Sutherland (1991).} 
  \item{pmvd}{ is the proportional marginal variance decomposition as proposed by Feldman (2005) (non-US version only).
         It can be interpreted as a weighted average over orderings among regressors, with data-dependent weights. } 
  \item{last}{ is each variable's contribution when included last, also sometimes called usefulness.} 
  \item{first}{ is each variable's contribution when included first, which is just the squared correlation between y and the variable.}
  \item{betasq}{ is the squared standardized coefficient.} 
  \item{pratt}{ is the product of the standardized coefficient and the correlation.}
  \item{genizi}{ is the \eqn{R^2} decomposition according to Genizi 1993}
  \item{car}{ is the \eqn{R^2} decomposition according to Zuber and Strimmer 2010, also available from package \pkg{care} 
         (squares of scores produced by function \code{\link[care]{carscore}})
}
Each metric is calculated using the internal function \dQuote{metric}\code{calc}, e.g. \code{lmgcalc}.

Five of the metrics in \code{calc.relimp} (\code{lmg}, \code{pmvd}, \code{pratt}, \code{genizi} and \code{car}) 
decompose the model \eqn{R^2}: 
they sum to the \eqn{R^2} that is to be decomposed, 
if \code{rela = FALSE}, and to 100pct if \code{rela = TRUE}.

The other metrics also (artificially) sum to 100pct if \code{rela = TRUE}. 
If \code{rela = FALSE}, they are given relative to var(y) (or the conditional variance of y after adjusting out the variables 
requested in \code{always}) but do not sum to \eqn{R^2}.

If \code{always} requests some variables to be always in the model, these are conditioned upon 
(i.e. included into the model first). Only the remaining \eqn{R^2} that is not explained by 
these variables is decomposed among the other regressors. This currently does not work for metrics \code{genizi} and \code{car}.

Four of the metrics, \code{lmg}, \code{pmvd}, \code{first} and \code{last}, 
are related to the order in which the variables are included into the model. 
For these it is possible to consider the variables in groups that are always entered into the model together. 

Note that \pkg{relaimpo} can only provide metric \code{lmg} for models with interactions (2-way interactions only). 
It averages only over those orders, for which the interactions enter the model after both their main effects.

Note that there are different types of weights, weights indicating the variability of the response 
(observations with a more variable response receive a lower weight than those with a less variable response,
like in the Aitken estimator), frequency weights indicating the number of observations with exactly the observed 
data pattern of the current observation, or weights indicating the number of population units represented by the 
current observation (inverse sampling probability, weights typically used in survey situations). All three types of 
weight alike can be handed to function \code{calc.relimp} using the \code{weights=} option. Note, however, that they 
have to be treated differently for bootstrapping (cf. \code{\link{boot.relimp}}).

Data from complex surveys can be treated by providing a survey design with \code{design=}-option. 
For \code{calc.relimp}, it is also sufficient to provide the weights derived from the design using the 
\code{weights=}-option. 

\code{calc.relimp} cannot handle data with missing values directly. It applies complete-case analysis, 
i.e. drops all units with any missing values by default. While this can be appropriate, if there are only few 
missing values, data with more severe missingness issues need special treatment. Package \pkg{relaimpo} 
offers the function \code{mianalyze.relimp} that handles multiply-imputed datasets (that can be created 
by several other \R-packages). Currently, possibilities in this function are limited due to the fact that 
it uses complex survey designs and bootstrapping which do not (yet) go together well with factors, interactions 
and calculated quantities in formulae.

}
}
\value{
  \item{var.y }{the variance of the response}
  \item{R2 }{the coefficient of determination, \eqn{R^2}}
  \item{R2.decomp }{the part of the coefficient of determination that is decomposed among the 
                variables under investigation }
  \item{lmg }{vector of relative contributions obtained from the \code{lmg} method, if \code{lmg} has been requested in \code{type}}
  \item{lmg.diff }{vector of pairwise differences between relative contributions obtained from the \code{lmg} method, 
if \code{lmg} has been requested in \code{type} and \code{diff=TRUE}}
  \item{lmg.rank }{rank of the regressors' relative contributions obtained from the \code{lmg} method, 
if \code{lmg} has been requested in \code{type} and \code{rank=TRUE}}
  \item{metric, metric.diff, metric.rank }{analogous to \code{lmg} for other metrics}
  \item{ave.coeffs}{average coefficients (only for variables not requested by \code{always}) 
            for models of different sizes;
            
            note that coefficients refer to modeling residuals after adjusting out variables 
            listed in always (both from response and other explanatory variables)}
  \item{namen}{names of variables, starting with response}
  \item{type}{character vector of metrics available}
  \item{rela}{Have metrics been normalized to sum 100\% ?}
  \item{always}{column numbers of variables always in the model; 
         in case of factors, the column numbers given here are not identical to those in 
         the call to \code{calc.relimp}, but refer to the columns of the model matrix}
  \item{alwaysnam}{names of variables always in the model}
  \item{call}{contains the call that generated the object}
}
\references{ 
Chevan, A. and Sutherland, M. (1991) Hierarchical Partitioning. \emph{The American Statistician} \bold{45}, 90--96.

Darlington, R.B. (1968) Multiple regression in psychological research and practice. \emph{Psychological Bulletin} \bold{69}, 161--182.

Feldman, B. (2005) Relative Importance and Value. Manuscript (Version 1.1, March 19 2005), downloadable at \url{http://www.prismanalytics.com/docs/RelativeImportance050319.pdf}

Genizi, A. (1993) Decomposition of R2 in multiple regression with correlated regressors. \emph{Statistica Sinica} \bold{3}, 407--420. 
Downloadable at \url{http://www3.stat.sinica.edu.tw/statistica/password.asp?vol=3&num=2&art=10}

Groemping, U. (2006) Relative Importance for Linear Regression in R: The Package relaimpo. 
         \emph{Journal of Statistical Software} \bold{17}, Issue 1. 
         Downloadable at \url{http://www.jstatsoft.org/v17/i01}

Lindeman, R.H., Merenda, P.F. and Gold, R.Z. (1980) \emph{Introduction to Bivariate and Multivariate Analysis}, Glenview IL: Scott, Foresman.

Zuber, V. and Strimmer, K. (2010) \emph{Variable importance and model selection by decorrelation}. Preprint, downloadable at \url{http://www.uni-leipzig.de/strimmer/lab/publications/preprints/carscore2010.pdf}

Go to \url{http://prof.beuth-hochschule.de/groemping/relaimpo/} for further information and references.
}
\author{ Ulrike Groemping, BHT Berlin }
\note{ There are two versions of this package. The version on CRAN is globally licensed under GPL version 2 (or later). 
There is an extended version with the interesting additional metric \code{pmvd} that is licensed according to GPL version 2
under the geographical restriction "outside of the US" because of potential issues with US patent 6,640,204. 
This version can be obtained from Ulrike Groemping's website (cf. references section). 
Whenever you load the package, a display tells you which version you are loading. }

\section{Warning }{\code{lmg} and \code{pmvd} are computer-intensive. Although they are calculated based on the 
covariance matrix, which saves substantial computing time in comparison to carrying out actual regressions, 
these methods still take quite long for problems with many regressors.

\code{relaimpo} is a package for univariate linear models. 
Using \code{relaimpo} on objects that inherit from class \code{lm} but are not univariate linear model objects 
may produce nonsensical results without warning. Objects of class \code{mlm} or \code{glm} with link functions other than identity 
or family other than gaussian lead to an error message. 
} 

\seealso{ \pkg{\link{relaimpo}}, \code{\link{booteval.relimp}}, \code{\link{mianalyze.relimp}}, 
     \code{\link{classesmethods.relaimpo}} }
\examples{
#####################################################################
### Example: relative importance of various socioeconomic indicators 
###          for Fertility in Switzerland
### Fertility is first column of data set swiss
#####################################################################
data(swiss)
    calc.relimp(swiss, 
       type = c("lmg", "last", "first", "betasq", "pratt", "genizi", "car") )
    # calculation of all available relative importance metrics 
        # non-US version offers the additional metric "pmvd", 
        # i.e. call would be 
        # calc.relimp(cov(swiss), 
        # type = c("lmg", "pmvd", "last", "first", "betasq", "pratt"), 
        # rela = TRUE )
    ## same analysis with formula or lm method and a few modified options
    crf <- calc.relimp(Fertility~Agriculture+Examination+Education+Catholic+Infant.Mortality,swiss, 
        subset = Catholic>40,
        type = c("lmg", "last", "first", "betasq", "pratt"), rela = TRUE )
    crf
    linmod <- lm(Fertility~Agriculture+Examination+Education+Catholic+Infant.Mortality,swiss)
    crlm <- calc.relimp(linmod, 
        type = c("lmg", "last", "first", "betasq", "pratt", "genizi", "car"), rela = TRUE )
    plot(crlm)
    # bar plot of the relative importance metrics

    #of statistical interest in this context: correlation matrix
       cor(swiss)

    #demonstration of conditioning on one regressor using always
    calc.relimp(swiss, 
       type = c("lmg", "last", "first", "betasq", "pratt"), rela = FALSE,
       always = "Education" )

    # using calc.relimp with grouping of two regressors
    # and weights (not reasonable here, purely for demo purposes)
    calc.relimp(swiss, 
       type = c("lmg", "last", "first"), rela = FALSE,
       groups = c("Education","Examination"), weights = abs(-23:23) )

    # using calc.relimp with grouping of two regressors
    # and a design object (not reasonable here, purely for demo purposes)
    des <- svydesign(~1, data=swiss, weights=~abs(-23:23))
    calc.relimp(swiss, 
       type = c("lmg", "last", "first"), rela = FALSE,
       groups = c("Education","Examination"), groupnames ="EduExam", design = des)

    # calc.relimp with factors (betasq and pratt not possible)
      # (calc.relimp would not be necessary here, 
      # because the experiment is balanced)
    calc.relimp(1/time~poison+treat,data=poisons, rela = FALSE,
         type = c("lmg", "last", "first"))
    # including also the interaction (lmg possible only)
    calc.relimp(1/time~poison*treat,data=poisons, rela = FALSE)
}
\keyword{ multivariate }% at least one, from doc/KEYWORDS
\keyword{ models }% __ONLY ONE__ keyword per line
