\documentclass[a4paper,12pt]{article}

\begin{document}

\parindent=0pt

\begin{center}

MA181 INTRODUCTION TO STATISTICAL MODELLING

BINOMIAL DISTRIBUTION

\end{center}

\begin{description}

\item[Bernoulli distribution]
Let $X$ be a random variable with probability function

$$p_X(x)=\left\{\begin{array}{ll}\pi,&x=1,\\1-\pi,&x=0.\end{array}\right.$$

Then $X$ follows a \textit{Bernoulli distribution.}

\item[Examples]

\begin{enumerate}

\item
The toss of a coin: $x=1$ if a head shows, $x=0$ if a tail.

\item
The birth of a baby: $x=1$ if a girl, $x=0$ if a boy.

\item
Testing items from a factory: $x=1$ if defective, $x=0$ if good.

\item
Generally: $x=1$ is called success, $x=0$ failure.

\end{enumerate}

If $X_1,X_2,\ldots,X_n\ (n\geq2)$ are independent and identically
distributed (iid) random variables following a Bernoulli
distribution, then they constitute a sequence of \textit{Bernoulli
trials}.

\item[Binomial distribution]
Let $X_1$ and $X_2$ be a sequence of two Bernoulli trials and let
$Y=X_1+X_2$. What is $P(Y=y)$? Since $Y$ can take only the three
values 0, 1 and 2, we have

$$P(Y=0)=P(X_1=0\textrm{ and }X_2=0)=(1-\pi)^2,$$

$$P(Y=1)=P[(X_1=0\textrm{ and }X_2=1)\textrm{ or }(X_1=1\textrm{
and }X_2=0)]=2\pi(1-\pi),$$

$$P(Y=2)=P(X_1=1\textrm{ and }X_2=1)=\pi^2.$$

Generally, let $Y=X_1+X_2+\ldots+X_n$. Then

$$P(Y=0)=P(X_1=0,X_2=0,\ldots,X_n=0)=(1-\pi)^n,$$

$$P(Y=n)=P(X_1=1,X_2=1,\ldots,X_n=1)=\pi^n\textrm{ and}$$

\begin{eqnarray*}
P(Y=y)&=&P[\textrm{a particular sequence of $y$ 1's and
$(n-y)$ 0's}]\times\\ &&\textrm{Number of such sequences}\\
&=&\left(\begin{array}{c}n\\y\end{array}\right)\pi^y(1-\pi)^{n-y},\
y=1,2,\ldots,n-1.
\end{eqnarray*}

The random variable $Y$ is said to follow a \textit{binomial
distribution} since the terms of its probability function derive
from the binomial expansion of $[\pi+(1-\pi)]^n$. If $0!$ is set,
by convention, to one, then the probability function of $Y$ can be
written as

$$p_Y(y)=\left(\begin{array}{c}n\\y\end{array}\right)\pi^y(1-\pi)^{n-y},\
y=0,1,\ldots,n,$$

since this then gives the correct probabilities for the cases
$y=0$ and $y=n$.

\item[Notation]
If $Y$ has this probability function, then we write $Y\sim
b(n,\pi)$.

\item[Distribution function]
The cumulative distribution function $F_Y(y)=P(Y\leq y)$ is given
by

$$F_Y(y)=\sum_{r=0}^yp_Y(r)=\sum_{r=0}^y
\left(\begin{array}{c}n\\r\end{array}\right) \pi^r(1-\pi)^{n-r},$$

which cannot be simplified further.

\item[Example]
The probability that a child is born with an inherited disease
(cystic fibrosis), given that both parents are normal carriers of
the associated gene, is $\frac{1}{4}$. If $Y$ is the number of
affected children in a family of six children, then $Y\sim
b\left(6,\frac{1}{4}\right)$. Hence, the probability of two
affected children is given by

$$p_Y(2)=\left(\begin{array}{c}6\\2\end{array}\right)
\left(\frac{1}{4}\right)^2\left(\frac{3}{4}\right)^4=15\times\frac{3^4}{4^6}=0.2966.$$

Further,

\begin{eqnarray*}
P(Y\leq2)&=&p_Y(0)+p_Y(1)+p_Y(2)\\
&=&\left(\begin{array}{c}6\\0\end{array}\right)
\left(\frac{1}{4}\right)^0\left(\frac{3}{4}\right)^6+
\left(\begin{array}{c}6\\1\end{array}\right)
\left(\frac{1}{4}\right)^1\left(\frac{3}{4}\right)^5
+\left(\begin{array}{c}6\\2\end{array}\right)\left(\frac{1}{4}\right)^2
\left(\frac{3}{4}\right)^4\\ &=&0.1780+0.3560+0.2966=0.8306.
\end{eqnarray*}

\item[Tables]

\begin{description}

\item[(i)]
If $Y\sim b(10, 0.45)$, then $P(Y\leq3)=0.2660$,

\item[(ii)]
If $Y\sim b(16,0.32)$, then
$P(Y=6)=P(Y\leq6)-P(Y\leq5)=0.7743-0.5926=0.1817$,

\item[(iii)]
If $Y\sim b(13,0.18)$, then
$P(Y\geq4)=1-P(Y\leq3)=1-0.8061=0.1939$,

\item[(iv)]
If $Y\sim b(17,0.403)$, then, by linear interpolation,
$P(Y\leq5)=0.2639+0.3(0.2372-0.2639)=0.2639-0.0080=0.2559$.

\end{description}

\item[Properties]

\begin{enumerate}

\item
$\displaystyle\sum_{y=0}^n\left(\begin{array}{c}n\\y\end{array}\right)\pi^y(1-\pi
)^{n-y}=[\pi+(1-\pi)]^n=1$,

\item
Let $Y'=n-Y$, where $Y\sim b(n,\pi)$. Then

\begin{eqnarray*}
P(Y'=y')&=&P(n-Y=y')=P(Y=n-y')\\
&=&\left(\begin{array}{c}n\\n-y'\end{array}\right)\pi^{n-y'}(1-\pi)^{y'}\\
&=&\left(
\begin{array}{c}n\\y'\end{array}\right)(1-\pi)^{y'}\pi^{n-y'},\
y'=0,1,\ldots,n.
\end{eqnarray*}

So $Y'\sim b(n,1-\pi)$.

This result is often useful if $\pi>\frac{1}{2}$, for which tables
are not generally available, since $p_Y(y)=p_{Y'}(n-y)$ and
$P(Y\leq y)=P(Y'\geq n-y)$, where the success probability for the
distribution of $Y'$ is $1-\pi$.

\item
Suppose $\pi=\frac{1}{2}$. Then

$$p_Y(y)=\left(\begin{array}{c}n\\y\end{array}\right)\left(\frac{1}{2}\right)
^n=\left(\begin{array}{c}n\\n-y\end{array}\right)\left(\frac{1}{2}\right)^n=p_Y(n-y)$$

for all values of $y$. Hence the distribution is symmetric.

\end{enumerate}

\item[Tables (continued)]

\begin{description}

\item[(v)]
If $Y\sim b(14,0.68)$, then $P(Y\leq9)=P(Y'\geq5)=1-P(Y'\leq4)$,
where $Y'\sim b(14, 0.32)$. So $P(Y\leq9)=1-0.5187=0.4813$.

\end{description}

\item[Estimation]
Suppose a sequence of $n$ Bernoulli trials yields $y$ successes.
Then the natural, and in many respects the best, estimate of
$\pi$, the success probability, is the observed proportion of
successes $\frac{y}{n}$.

\item[Example (Multiple observations)]
The table below gives, in its second column, the frequency
distribution of the number $Y$ of peas found in the pod of a
four-seeded line of pea. A total of 269 pods were inspected.

\begin{center}
\begin{tabular}{|c|c|c|c|}
\hline Peas per pod&Observed&$\hat{p}_Y(y)$&Expected\\
$y$&frequency of&&frequency of\\ &pods&&pods\\ \hline
0&16&0.0399&10.74\\ \hline 1&45&0.1976&53.15\\ \hline
2&100&0.3666&98.62\\ \hline 3&82&0.3023&81.33\\ \hline
4&26&0.0935&25.15\\ \hline Total&269&0.9999&268.99\\ \hline
\end{tabular}
\end{center}

We will assume that $Y\sim b(4,\pi)$ and estimate $\pi$ by the
average proportion of successes per pod, i.e. by

$$\hat{\pi}=\frac{16\left(\frac{0}{4}\right)+45\left(\frac{1}{4}\right)
+100\left(\frac{2}{4}\right)+82\left(\frac{3}{4}\right)+26\left(\frac{4}{4}\right)}{269}=0.5530.$$

Substituting the value into the probability function of $Y$ yields
the estimated probability function given by

$$\hat{p}_Y(y)=\left(\begin{array}{c}4\\y\end{array}\right)(0.5530)^y(0.4470)^{4-y},\
y=0,1,2,3,4.$$

The values of this function are shown in the third column of the
table. Multiplying them by 269 gives the expected frequencies, for
$y=0,1,2,3,4$, which may be compared with the observed frequencies
to determine how good a fit the binomial distribution is to the
data. These values are shown in the last column of the table.


\end{description}

\end{document}
