\encoding{utf8}
\name{tost.rank.sum}
\alias{Two-sample rank sum test for stochastic equivalence}
\alias{tost.rank.sum}
\concept{Stochastic equivalence}
\concept{Rank sums}
\concept{Two One-Sided Tests}
\title{Two-sample rank sum test for stochastic equivalence}
\description{
  \loadmathjax
  Performs two one-sided approximate \emph{z} tests for stochastic equivalence between two independent samples.}
\usage{
tost.rank.sum(
    x, by, 
    eqv.type     = equivalence.types, 
    eqv.level    = 1, 
    upper        = NA, 
    conf.level   = 0.95, 
    x.name       = "", 
    by.name      = "",
    by.values    = NULL,
    ccontinuity  = FALSE, 
    relevance    = TRUE)

equivalence.types
#c("delta", "epsilon")
}
\arguments{
 \item{x}{a numeric vector of data values.}
 \item{by}{a numeric or factor vector of exactly two values indicating group membership.}
 \item{eqv.type}{defines whether the equivalence interval will be defined in terms of \mjeqn{\varepsilon}{epsilon} or \mjeqn{\Delta}{Delta} (\code{"epsilon"}, or \code{"delta"}).  These options change the way that \code{eqv.level} is interpreted: when \code{"epsilon"} is specified, the \code{eqv.level} is measured in units of the \emph{z} distribution, and when \code{"delta"} is specified, the \code{eqv.level} is measured in the units of rank sums; put another way \mjeqn{\varepsilon = \frac{\Delta}{\text{standard error}}}{epsilon = Delta/standard error}.  Because units of rank sums is unlikely to be substantively meaningful, the default is \code{"epsilon"}.  \cr \cr  Defining tolerance in terms of \mjeqn{\varepsilon}{epsilon} means that it is not possible to reject any test for mean equivalence's \mjeqn{\text{H}_{0}^{-}}{Ho} if \mjeqn{\varepsilon \le z_{\alpha}}{epsilon <= the critical value of \emph{z} for a given alpha}.  Because \mjeqn{\varepsilon = \frac{\Delta}{\text{standard error}}}{epsilon = Delta/standard error}, we can see that it is not possible to reject any \mjeqn{\text{H}_{0}^{-}}{Ho} if \mjeqn{\Delta \le \text{standard error} \times z_{\alpha}}{Delta <= the product of the standard error and critical value of \emph{z} for a given alpha}.  \code{tost.rank.sum} reports when either of these conditions obtains.}
 \item{eqv.level}{defines the equivalence threshold for the tests depending on whether \code{eqv.type} is \code{"epsilon"} or \code{"delta"} (see above).  Researchers are responsible for choosing meaningful values of \mjeqn{\varepsilon}{epsilon} or \mjeqn{\Delta}{Delta}.  The default value is 1, which is not a useful value for either \code{eqv.type="delta"} or \code{eqv.type="epsilon"}.}
 \item{upper}{defines the upper equivalence threshold for the test, is assumed to be positive, and transforms the meaning of \code{eqv.level} to mean the \emph{lower} equivalence threshold for the test.  Also, \code{eqv.level} is assumed to be a negative value.  Taken together, these correspond to Schuirmann's (1987) asymmetric equivalence intervals.  If \code{upper==abs(eqv.level)}, then \code{upper} will be ignored.}
 \item{conf.level}{confidence level of the interval, and complement of the test's nominal type I error rate \mjeqn{\alpha}{alpha}.}
 \item{x.name}{specifies how the outcome variable will be labeled in the output. The default value of \code{x.name} is the variable name of \code{x}.}
 \item{by.name}{specifies how the grouping variable will be labeled in the output. The default value of \code{by.name} is the variable name of \code{by}.}
 \item{by.values}{a string vector of exactly two values specifying how group names will be labeled in the output. The default value of \code{by.values} is the factor labels or, if those are \code{NA}, the factor levels of \code{by}.}
 \item{ccontinuity}{calculates test statistics for both positivist and negativist tests using a continuity correction.  For the positivist test the approximate statistic \mjeqn{z = \tfrac{\text{sgn}(W)\times(|W-\mu_{W}|-0.5)}{\sigma_{W}}}{z = sgn(\emph{W})×(|\emph{W} -- mu_\emph{W}| -- 0.5) ÷ sigma_\emph{W}}.  \cr \cr  For the negativist test using \mjeqn{\varepsilon}{epsilon} the approximate test statistics are \mjeqn{z_1 = \varepsilon_{\text{u}} - z}{z_1 = epsilon_u -- \emph{z}}, and \mjeqn{z_2 = z - \varepsilon_{\text{l}}}{z_2 = \emph{z} -- epsilon_l} (where \mjeqn{z}{\emph{z}} is the continuity-corrected test statistic from the positivist test).  \cr \cr  For the negativist test using \mjeqn{\Delta}{Delta} the approximate test statistics are \mjeqn{z_1 = \tfrac{\Delta_{\text{u}} - [\text{sgn}(W)\times(|W-\mu_{W}|-0.5)]}{\sigma_{W}}}{z_1 = (Delta_u -- sgn(\emph{W})×(|\emph{W} -- mu_\emph{W}| -- 0.5)) ÷ sigma_\emph{W}} and \mjeqn{z_2 = \tfrac{[\text{sgn}(W)\times(|W-\mu_{W}|-0.5)]-\Delta_{\text{l}}}{\sigma_{W}}}{z_2 = (sgn(\emph{W})×(|\emph{W} -- mu_\emph{W}| -- 0.5) -- Delta_l) ÷ sigma_\emph{W}}.}
 \item{relevance}{reports results and inference for combined tests for difference and for equivalence for a specific \code{conf.level}, \code{eqv.type}, \code{eqv.level}, and, if used, \code{upper}.  See the Remarks section for more details on inference from combined tests.}
}
\details{\code{tost.rank.sum} tests the null hypothesis that one of the two independent groups is stochastically larger than the other by at least the equivalence threshold, and provides evidence that the two groups are stochastically equivalent within that threshold.  \code{tost.rank.sum} uses the \emph{z} approximation to the rank sum test (Wilcoxon, 1945; Mann and Whitney, 1947) in a two one-sided tests approach (Schuirmann, 1987).

With respect to the rank sum test, a negativist null hypothesis takes one of the following two forms depending on whether tolerance is defined in terms of \mjeqn{\Delta}{Delta} (equivalence expressed in units of rank sums) or in terms of \mjeqn{\varepsilon}{epsilon} (equivalence expressed in the units of the \emph{z} distribution):

\emph{}\mjeqn{\phantom{22}\text{H}_{0}^{-}\text{: }|W - \mu_W| \ge \Delta}{&nbsp;&nbsp;Ho: |\emph{W} -- mu_\emph{W}| >= Delta},\cr
\emph{}\mjeqn{\phantom{22}}{  }where the equivalence interval ranges from \mjeqn{\left(W - \mu_W\right) - \Delta}{(\emph{W} -- mu_\emph{W}) -- Delta} to \mjeqn{\left(W - \mu_W\right) + \Delta}{(\emph{W} -- mu_\emph{W}) + Delta}. This translates directly into two one-sided null hypotheses:

\emph{}\mjeqn{\phantom{2222}\text{  H}_{01}^{-}\text{: }W - \mu_W \ge \Delta}{&nbsp;&nbsp;&nbsp;&nbsp;Ho1: \emph{W} -- mu_\emph{W} >= Delta}, or\cr
\emph{}\mjeqn{\phantom{2222}\text{  H}_{02}^{-}\text{: }W - \mu_W \le -\Delta}{&nbsp;&nbsp;&nbsp;&nbsp;Ho2: \emph{W} -- mu_\emph{W} <= --Delta}.

--OR--

\emph{}\mjeqn{\phantom{22}\text{H}_{0}^{-}\text{: }|Z| \ge \varepsilon ,}{&nbsp;&nbsp;Ho: |Z| >= epsilon,}\cr
\emph{}\mjeqn{\phantom{22}}{  }where the equivalence interval ranges from \mjeqn{-\varepsilon}{--epsilon} to \mjeqn{\varepsilon}{epsilon}. This also translates directly into two one-sided null hypotheses:

\emph{}\mjeqn{\phantom{2222}\text{H}_{01}^{-}\text{: }Z \ge \varepsilon}{&nbsp;&nbsp;&nbsp;&nbsp;Ho1: Z >= epsilon}; or\cr
\emph{}\mjeqn{\phantom{2222}\text{H}_{02}^{-}\text{: }Z \le -\varepsilon}{&nbsp;&nbsp;&nbsp;&nbsp;Ho2: Z <= --epsilon}.

When an asymmetric equivalence interval is defined using the \code{upper} option the general negativist null hypothesis becomes:

\emph{}\mjeqn{\phantom{22}\text{H}_{0}^{-}\text{: }W - \mu_W \le \Delta_{\text{l}}}{&nbsp;&nbsp;Ho: \emph{W} -- mu_\emph{W} <= Delta_lower}, or \mjeqn{W - \mu_W \ge \Delta_{\text{u}}}{\emph{W} -- mu_\emph{W} >= Delta_upper}\cr
\emph{}\mjeqn{\phantom{22}}{  }where the equivalence interval ranges from \mjeqn{\left(W - \mu_W\right) + \Delta_{\text{l}}}{(\emph{W} -- mu_\emph{W}) + Delta_lower} to \mjeqn{\left(W - \mu_W\right) + \Delta_{\text{u}}}{(\emph{W} -- mu_\emph{W}) + Delta_upper}.  This also translates directly into two one-sided null hypotheses:

\emph{}\mjeqn{\phantom{2222}\text{H}_{01}^{-}\text{: }W - \mu_W \ge \Delta_{\text{u}}}{&nbsp;&nbsp;&nbsp;&nbsp;Ho1: \emph{W} -- mu_\emph{W} >= Delta_upper}; or\cr
\emph{}\mjeqn{\phantom{2222}\text{H}_{02}^{-}\text{: }W - \mu_W \le \Delta_{\text{l}}}{&nbsp;&nbsp;&nbsp;&nbsp;Ho2: \emph{W} -- mu_\emph{W} <= Delta_lower}.

--OR--

\emph{}\mjeqn{\phantom{22}\text{H}_{0}^{-}\text{: }Z \le \varepsilon_{\text{l}}}{&nbsp;&nbsp;Ho: Z <= epsilon_lower}, or \mjeqn{Z \ge \varepsilon_{\text{u}}}{Z >= epsilon_upper}, with:

\emph{}\mjeqn{\phantom{2222}\text{H}_{01}^{-}\text{: }Z \ge \varepsilon_{\text{u}}}{&nbsp;&nbsp;&nbsp;&nbsp;Ho1: Z >= epsilon_upper}; or\cr
\emph{}\mjeqn{\phantom{2222}\text{H}_{02}^{-}\text{: }Z \le \varepsilon_{\text{l}}}{&nbsp;&nbsp;&nbsp;&nbsp;Ho2: Z <= epsilon_lower}.\cr
 
NOTE: the appropriate level of \mjeqn{\alpha = (1 - }{alpha =  (1 -- }\code{conf.level}\mjeqn{)}{)} is precisely the same as in the corresponding two-sided test for mean difference, so that, for example, if one wishes to make a type I error 1\% of the time, one simply conducts both of the one-sided tests of \mjeqn{\text{H}_{01}^{-}}{Ho1} and \mjeqn{\text{H}_{02}^{-}}{Ho2} by comparing the resulting p-values to 0.01 (Wellek, 2010).

\subsection{Remarks}{Following Tryon and Lewis (2008), when rejection decisions from both tests for difference (e.g., \mjeqn{\text{H}_{0}^{+}\text{: }W - \mu_W = 0}{positivist Ho: \emph{W} -- mu_\emph{W} = 0}) and tests for equivalence (e.g., either \mjeqn{\text{H}_{0}^{-}\text{: }|W- \mu_W| \ge \Delta}{negativist Ho: |\emph{W} -- mu_\emph{W}| >= Delta}, or \mjeqn{\text{H}_{0}^{-}\text{: }|Z| \ge \varepsilon}{negativist Ho: |Z| >= epsilon}) are combined, there are four possible interpretations for a given \mjeqn{\alpha}{alpha} and \mjeqn{\Delta}{Delta} or \mjeqn{\varepsilon}{epsilon}:

\enumerate{
\item One may reject \mjeqn{\text{H}_{0}^{+}}{the positivist Ho}, but fail to reject \mjeqn{\text{H}_{0}^{-}}{the negativist Ho}, and conclude that there is \bold{relevant \mjeqn{\boldsymbol{0}^{\textbf{th}}}{0th}-order stochastic dominance} between the first and second groups which is at least as large as \mjeqn{\varepsilon}{epsilon} or \mjeqn{\Delta}{Delta}.

\item One may fail to reject \mjeqn{\text{H}_{0}^{+}}{the positivist Ho}, but reject \mjeqn{\text{H}_{0}^{-}}{the negativist Ho}, and conclude that there is \bold{\mjeqn{\boldsymbol{0}^{\textbf{th}}}{0th}-order stochastic equivalence} between the first and second groups within the equivalence range (i.e. defined by \mjeqn{\varepsilon}{epsilon} or \mjeqn{\Delta}{Delta}).

\item One may reject both \mjeqn{\text{H}_{0}^{+}}{the positivist Ho} and \mjeqn{\text{H}_{0}^{-}}{the negativist Ho}, and conclude that there is a \bold{trivial \mjeqn{\boldsymbol{0}^{\textbf{th}}}{0th}-order stochastic dominance} between the first and second groups which lies within the equivalence range (i.e. defined by \mjeqn{\varepsilon}{epsilon} or \mjeqn{\Delta}{Delta}).

\item One may fail to reject both \mjeqn{\text{H}_{0}^{+}}{the positivist Ho} and \mjeqn{\text{H}_{0}^{-}}{the negativist Ho}, and draw an \bold{indeterminate} conclusion, because the data are underpowered to detect either \mjeqn{0^{\text{th}}}{0th}-order stochastic dominance or equivalence.
}
}
}
\value{
\code{tost.rank.sum} returns:
 \item{statistics}{a vector of the \emph{z} statistics for the two one-sided tests; if \code{relevance=TRUE}, these are followed by the value of the \emph{z} statistic for the positivist test for difference.}
 \item{p.values}{a vector of \emph{p} values for the \emph{z} tests.}
 \item{rank_sums}{a vector containing the rank sums in each group, and the rank sum expected under the positivist null hypothesis.}
 \item{sample_sizes}{a vector containing the sample sizes in both groups, as well as the combined sample size of both groups.}
 \item{var_adj}{a scalar containing the adjusted variance under the positivist null hypothesis.}
 \item{threshold}{a scalar containing the equivalence threshold when \code{eqv.type="delta"} and \code{upper=NA}. A vector containing the asymmetric equivalence thresholds \code{upper}, and \code{eqv.level} when \code{eqv.type="delta"}. A scalar containing the equivalence threshold when \code{eqv.type="epsilon"} and \code{upper=NA}. A vector containing the asymmetric equivalence thresholds \code{upper}, and \code{eqv.level} when \code{eqv.type="epsilon"}.}
 \item{conclusion}{a string containing the relevance test conclusion when \code{relevance=TRUE}.}
 }
\author{
Alexis Dinno (\email{alexis.dinno@pdx.edu})

Please contact me with any questions, bug reports or suggestions for improvement.  Fixing bugs will be facilitated by sending along:
\enumerate{
\item a copy of the data (de-labeled or anonymized is fine),\cr
\item a copy of the command syntax used, and\cr
\item a copy of the exact output of the command.\cr
}
I am indebted to my winter 2013 and fall 2023 students for their inspiration. Much appreciation to Mick McVeety for troubleshooting the translation of my Stata \bold{tost} package to R.
\subsection{Suggested citation}{Dinno, A.  2025.  \bold{tost.rank.sum}: Equivalence rank sum tests.  In: \bold{tost.suite} R software package.}
}
\references{

Mann, H. B., and D. R. Whitney. (1947)  \href{https://www.jstor.org/stable/2236101}{On a test whether one of two random variables is stochastically larger than the other}.
\emph{Annals of Mathematical Statistics} \bold{18}, 50--60.

Schuirmann, D. A.  (1987)  \href{https://pubmed.ncbi.nlm.nih.gov/3450848/}{A comparison of the two one-sided tests procedure and the power approach for assessing the equivalence of average bioavailability}. \emph{Journal of Pharmacokinetics and Biopharmaceutics}. \bold{15}, 657--680.

Snedecor, G. W., and W.  G.  Cochran.  (1989)  \href{https://www.wiley.com/en-us/Statistical\%20Methods,\%208th\%20Edition-p-9780813815619}{\emph{Statistical Methods}}.  8th ed. Ames, IA: Iowa State University Press.

Tryon, W. W., and C. Lewis.  (2008)  \href{https://pubmed.ncbi.nlm.nih.gov/18778155/}{An inferential confidence interval method of establishing statistical equivalence that corrects Tryon's (2001) reduction factor}. \emph{Psychological Methods}. \bold{13}, 272--277.

Wellek, S.  (2010)  \href{https://www.routledge.com/Testing-Statistical-Hypotheses-of-Equivalence-and-Noninferiority/Wellek/p/book/9781439808184}{\emph{Testing Statistical Hypotheses of Equivalence and Noninferiority}},  Second edition.  Chapman and Hall/CRC Press. p. 31.

Wilcoxon, F.  (1945)  \href{http://www.jstor.org/stable/3001968}{Individual comparisons by ranking methods}. \emph{Biometrics Bulletin}. \bold{1}, 80--83.
}
\seealso{
  \code{\link{tost.sign.rank}}, \code{\link{wilcox.test}}, \code{\link{Wilcoxon}}.
}
\examples{
require("webuse")

# Setup
webuse("fuel2")

# Perform two-sample rank-sum relevance test on mpg by using the two
# groups defined by treat; equivalence interval is +/- 1 sd beyond the
# critical value of Z for alpha = 0.1.
tost.rank.sum(
    x=fuel2$mpg, 
    by=fuel2$treat, 
    eqv.type="epsilon", 
    eqv.level=qnorm(.9)+1, 
    conf.level=.9, 
    relevance=TRUE)

# Perform asymmetric rank-sum relevance test on mpg by using the two
# groups defined by treat, and add a continuity correction.
# The lower end of the equivalence interval = qnorm(.9)+1 = 2.281552
# meaning equivalence must lie no more than 1 sd beyond the critical value
# of Z for alpha = 0.1.  The upper end of the equivalence interval
# = qnorm(.9)+.5 = 1.781552 meaning equivalence must lie no more than
# 0.5 sd beyond the critical value of Z for alpha = 0.1.
tost.rank.sum(
    x=fuel2$mpg, 
    by=fuel2$treat, 
    eqv.type="epsilon", 
    eqv.level=qnorm(.9)+1, 
    upper=qnorm(.9)+.5, 
    conf.level=.9, 
    ccontinuity=TRUE, 
    relevance=TRUE)
}
\keyword{htest}
\keyword{stats}
