Markov Chain Monte Carlo Simulation

LeeSungRyeong

Bernoulli likelihood와 beta prior를 이용한 성공의 확률 theta의 사후분포 \[Beta(x+\alpha,N-x+\beta)\]에서 자료를 생성하여 히스토그림을 그려보자. 단, \(\alpha= \beta=5\)이고, \(N=10, x=8\)이다.

사후분포가 \(Beta(13,7)\)이므로 Beta 분포에서 자료를 50000개 생성하여 히스토그램을 그리면 다음과 같다.

B.10000 = rbeta(10000,13,7)
hist(B.10000,freq=F,breaks=100,xlim=c(0,1),ylim=c(0,5))
x=seq(from=0,to=1,by=0.01)
lines(x,dbeta(x,13,7),type="l")

아래 MH.bin는 Bernoulli likelihood와 beta prior를 이용한 성공의 확률 theta의 사후분포 \[Beta(x+\alpha,N-x+\beta)\]에서 자료를 생성하는 Metropolis algorithm을 구현한 함수이다.

자료를 50, 500, 5000, 50000으로 하여 사후분포의 히스토그램을 그리고 실제 분포와 비교한다. 자료의 수가 50에서 50000으로 증가할수록 실제 분포와 매우 가까워짐을 확인할 수 있다.

MH.bin = function(n)
{

  for (i in 1:n){

    theta.pro = runif(1)

    a1 = dbeta(theta.pro,alpha,beta) * dbinom(x,size=N,prob=theta.pro)
    a0 = dbeta(theta.cur,alpha,beta) * dbinom(x,size=N,prob=theta.cur)
    
    p.move = min(a1/a0,1)
    
    u = runif(1)
    
    if (p.move >= u) {
      
      theta.gen = c(theta.gen,theta.pro)
      theta.cur = theta.pro
      
    } else if (p.move < u) {
      theta.gen = c(theta.gen,theta.cur) 
    }
    
  }
  
  return(theta.gen)
  
}


alpha=5;beta=5  ## prior distribution
N=10;x=8    ## likelihood

par(mfrow=c(2,2))

n = 50  ## 50개를 생성함.

theta.gen = NULL
theta.cur = 0.5 # 처음 위치는 0.5

theta.gen = MH.bin(n)  ## MC 알고리즘을 이용하여 사후 분포의 자료를 50개 생성한다. 

hist(theta.gen,freq=F,breaks=10,ylim=c(0,5),xlim=c(0,1))
y=seq(from=0,to=1,by=0.01)
lines(y,dbeta(y,(x+alpha),(N-x+beta)),type="l")

n = 500 ## 500개를 생성함.

theta.gen = NULL
theta.cur = 0.5

theta.gen = MH.bin(n)

hist(theta.gen,freq=F,breaks=20,ylim=c(0,5),xlim=c(0,1))
y=seq(from=0,to=1,by=0.01)
lines(y,dbeta(y,(x+alpha),(N-x+beta)),type="l")

n = 5000  ## 5000개를 생성함.

theta.gen = NULL
theta.cur = 0.5

theta.gen = MH.bin(n)

hist(theta.gen,freq=F,breaks=50,ylim=c(0,5),xlim=c(0,1))
y=seq(from=0,to=1,by=0.01)
lines(y,dbeta(y,(x+alpha),(N-x+beta)),type="l")

n = 50000  ## 50000개를 생성함.

theta.gen = NULL
theta.cur = 0.5

theta.gen = MH.bin(n)

hist(theta.gen,freq=F,breaks=100,ylim=c(0,5),xlim=c(0,1))
y=seq(from=0,to=1,by=0.01)
lines(y,dbeta(y,(x+alpha),(N-x+beta)),type="l")

\(X\sim B(10,\theta)\)이고 모수 \(\theta\)의 사전밀도함수 \(p(\theta)\)는 다음과 같다. \[p(\theta) = \frac{1}{\sqrt{2 \pi}} \frac{1}{ \theta (1- \theta)} e^{- \frac{1}{2} \left \{ log \left( \frac{ \theta}{1- \theta} \right ) \right \}^2 }, ~~ 0 < \theta < 1.\] 이는 \(\rho = log \left ( \frac{\theta}{1-\theta} \right )\)가 표준정규분포를 따르는 경우에 해당한다. \(x=5\)가 관측되었을 때 MCMC 방법으로 \(\theta\)의 사후기대값을 구하여라.

사후밀도함수는 \(p(\theta|x) = C p(\theta) \theta^x (1-\theta)^{10-x}\)인데, 사전분포가 공액사전분포인 베타분포가 아니므로 사후분포의 계산이 매우 어렵다.

후보생성함수 (proposal distribution)는 \(U(0,1)\)으로 선택하고 \[p_{move} = min \left \{ \frac{p(\theta_{pro}) p(x|\theta_{pro})}{p(\theta_{cur}) p(x|\theta_{cur})}, 1 \right \}\]로 얻어진다.

다음은 R로 구현한 자료 발생 알고리즘과 사후기대값 계산 결과이다.

prior.d = function(theta)
{
  temp1 = theta*(1-theta)*sqrt(2*pi)
  temp2 = - 0.5 * ( log (theta/(1-theta))  )^2
  return(exp(temp2)/temp1)
}

MH.bin.2 = function(n,x)
{
  
  for (i in 1:n){
    
    theta.pro = runif(1)
    
    a1 = prior.d(theta.pro) * dbinom(x,size=N,prob=theta.pro)
    a0 = prior.d(theta.cur) * dbinom(x,size=N,prob=theta.cur)
    
    p.move = min(a1/a0,1)
    
    u = runif(1)
    
    if (p.move >= u) {
      
      theta.gen = c(theta.gen,theta.pro)
      theta.cur = theta.pro
      
    } else if (p.move < u) {
      theta.gen = c(theta.gen,theta.cur) 
    }
    
  }
  
  return(theta.gen)
  
}

N = 10; x = 5    # data

n = 5000  # number of random numbers from posterior

theta.gen = NULL
theta.cur = 0.5

theta.gen = MH.bin.2(n,x)

hist(theta.gen,freq=F,breaks=50)
lines(density(theta.gen),col=2)

ans = mean(theta.gen)
ans

## [1] 0.4938919

n = 50000  # number of random numbers from posterior

theta.gen = NULL
theta.cur = 0.5

theta.gen = MH.bin.2(n,x)

hist(theta.gen,freq=F,breaks=100)
lines(density(theta.gen),col=2)

ans = mean(theta.gen)
ans

## [1] 0.4993383

Uniform distribution as proposal distribution

확률변수 \(X\)가 모수 \(\theta\)인 포아송분포를 따른다고 하고, \(\theta\)의 사전분포가 \(\theta \sim Gamma(10,5)\)라고 하자. \(x=10\)이 관측되었을 때, Metropolis algorithm을 이용하여 사후분포를 그리고, 실제 사후분포와 비교하시오. 이 때, 표본의 수는 n=50, 500, 5000, 50000으로 하여 생성하시오. Hint: rgamma과 rpois 함수를 이용하시오.

MH.poi = function(n,x)
{
  
  for (i in 1:n){
    
    theta.pro = runif(1, min=0, max=100)    ## U(0,100)을 후보생성함수로 사용한다.
   
    a1 = dgamma(theta.pro,shape=alpha,scale=beta) * dpois(x,theta.pro)
    a0 = dgamma(theta.cur,shape=alpha,scale=beta) * dpois(x,theta.cur)
  
    p.move = min(a1/a0,1)
    
    u = runif(1)
    
    if (p.move >= u) {
      
      theta.gen = c(theta.gen,theta.pro)
      theta.cur = theta.pro
      
    } else if (p.move < u) {
      theta.gen = c(theta.gen,theta.cur) 
    }
    
  }
  
  return(theta.gen)
  
}


alpha=10;beta=5  # prior parameter values
x=10   # data

par(mfrow=c(2,2))

n = 50  # number of random numbers from posterior

theta.gen = NULL
theta.cur = 1.0

theta.gen = MH.poi(n,x)

hist(theta.gen,freq=F,breaks=10,xlim=c(0,50),ylim=c(0,0.12))
y=seq(from=0,to=50,by=0.01)
lines(y,dgamma(y,shape=(x+alpha),scale=1/(1+1/beta)),type="l",col=2)

n = 500  # number of random numbers from posterior

theta.gen = NULL
theta.cur = 1.0

theta.gen = MH.poi(n,x)

hist(theta.gen,freq=F,breaks=20,xlim=c(0,50),ylim=c(0,0.12))
y=seq(from=0,to=50,by=0.01)
lines(y,dgamma(y,shape=(x+alpha),scale=1/(1+1/beta)),type="l",col=2)

n = 5000  # number of random numbers from posterior

theta.gen = NULL
theta.cur = 1.0

theta.gen = MH.poi(n,x)

hist(theta.gen,freq=F,breaks=50,xlim=c(0,50),ylim=c(0,0.12))
y=seq(from=0,to=50,by=0.01)
lines(y,dgamma(y,shape=(x+alpha),scale=1/(1+1/beta)),type="l",col=2)

n = 50000  # number of random numbers from posterior

theta.gen = NULL
theta.cur = 1.0

theta.gen = MH.poi(n,x)

hist(theta.gen,freq=F,breaks=100,xlim=c(0,50),ylim=c(0,0.12))
y=seq(from=0,to=50,by=0.01)
lines(y,dgamma(y,shape=(x+alpha),scale=1/(1+1/beta)),type="l",col=2)

Metropolis-Hastings algorithm: Chi-Square distribution as proposal distribution

후보생성함수로 5.1절의 균일분포(uniform distribution) 대신 카이제곱 (Chi-square) 분포를 이용해 보자.

카이제곱 (Chi-square) 후보생성밀도함수를 q(\(\theta\))라 하면 이동확률은\[p_{move} = min \left \{ \frac{p(\theta_{pro}) p(x|\theta_{pro}) q(\theta_{cur})}{p(\theta_{cur}) p(x|\theta_{cur}) q(\theta_{pro})}, 1 \right \}\]로 구하여진다.

MH.poi.2 = function(n,x)
{
  
  for (i in 1:n){
    
    theta.pro = rchisq(1, 18)
   
    a1 = dgamma(theta.pro,shape=alpha,scale=beta) * dpois(x,theta.pro) * dchisq(theta.cur,18)
    a0 = dgamma(theta.cur,shape=alpha,scale=beta) * dpois(x,theta.cur) * dchisq(theta.pro,18)
  
    p.move = min(a1/a0,1)
    
    u = runif(1)
    
    if (p.move >= u) {
      
      theta.gen = c(theta.gen,theta.pro)
      theta.cur = theta.pro
      
    } else if (p.move < u) {
      theta.gen = c(theta.gen,theta.cur) 
    }
    
  }
  
  return(theta.gen)
  
}


alpha=10;beta=5  # prior parameter values
x=10   # data

par(mfrow=c(2,2))

n = 50  # number of random numbers from posterior

theta.gen = NULL
theta.cur = 1.0

theta.gen = MH.poi.2(n,x)

hist(theta.gen,freq=F,breaks=10,xlim=c(0,50),ylim=c(0,0.12))
y=seq(from=0,to=50,by=0.01)
lines(y,dgamma(y,shape=(x+alpha),scale=1/(1+1/beta)),type="l",col=2)

n = 500  # number of random numbers from posterior

theta.gen = NULL
theta.cur = 1.0

theta.gen = MH.poi.2(n,x)

hist(theta.gen,freq=F,breaks=20,xlim=c(0,50),ylim=c(0,0.12))
y=seq(from=0,to=50,by=0.01)
lines(y,dgamma(y,shape=(x+alpha),scale=1/(1+1/beta)),type="l",col=2)

n = 5000  # number of random numbers from posterior

theta.gen = NULL
theta.cur = 1.0

theta.gen = MH.poi.2(n,x)

hist(theta.gen,freq=F,breaks=50,xlim=c(0,50),ylim=c(0,0.12))
y=seq(from=0,to=50,by=0.01)
lines(y,dgamma(y,shape=(x+alpha),scale=1/(1+1/beta)),type="l",col=2)

n = 50000  # number of random numbers from posterior

theta.gen = NULL
theta.cur = 1.0

theta.gen = MH.poi.2(n,x)

hist(theta.gen,freq=F,breaks=100,xlim=c(0,50),ylim=c(0,0.12))
y=seq(from=0,to=50,by=0.01)
lines(y,dgamma(y,shape=(x+alpha),scale=1/(1+1/beta)),type="l",col=2)

Introduction to Gibbs sampler

정규분포와 분산에 대하여 공액사전분포를 사용한 경우에 대하여는 강의록 10장에서 설명하였다.

반공액사전분포 \(p(\mu) \sim N(a,b^2)\)와 \(\sigma^2 \sim IG(\alpha,\beta)\)를 사용하는 경우에는 형태가 매우 복잡해 진다. 그러나, 다음 두 가지 결론을 쉽게 얻을 수 있다. \[p(\sigma^2 | {\bf x} , \mu) \sim IG \left (\alpha+ \frac{n}{2}, \beta + \frac{1}{2} \sum_{i=1}^n (x_i - \mu)^2 \right ) \] \[p(\mu | {\bf x}, \sigma^2 ) \sim N \left ( \frac{\frac{\sigma^2}{n}a + b^2 \bar x } { \frac{\sigma^2}{n} + b^2 }, \left \{ \frac{1}{b^2} + \frac{n}{\sigma^2} \right \}^{-1} \right ).\] 따라서, \(\mu\)와 \(\sigma^2\)가 각각 주어졌을 때 조건부분포로부터의 표본추출이 용이함을 알 수 있다.

이를 일반화시켜 모수 \({\boldsymbol \theta} = (\theta_1 , \theta_2, \theta_3)\)의 각 원소 모수의 완전조건부 사후분포 \(p(\theta_1|{\bf x},\theta_2,\theta_3)\), \(p(\theta_2|{\bf x},\theta_1,\theta_3)\), \(p(\theta_3|{\bf x},\theta_1,\theta_2)\)로부터의 표본추출이 용이한 경우를 생각해 보자. 위의 조건부사후분포는 관심모수를 제외한 나머지가 모두 주어진 조건이므로 완전조건부 사후분포 (full conditional posterior distribution)라고 한다. 이렇듯 결합 분포는 복잡하지만 완전 조건부분포가 각각 편리한 형태로 주어지는 경우는 여러 다변량모수의 경우에 빈번히 발생한다.

이 경우 매우 쉽게 적용할 수 있는 깁스 표본기법의 알고리즘은 다음과 같다.

(Step 0) \(\theta_1, \theta_2, \theta_3\)의 초기치 \(\theta_1^{(0)},\theta_2^{(0)},\theta_3^{(0)}\)를 정한다.
(Step 1) \(\theta_1^{(1)} \sim p(\theta_1 | x,\theta_2^{(0)},\theta_3^{(0)})\), \(~~\theta_2^{(1)} \sim p(\theta_2 | x,\theta_1^{(1)},\theta_3^{(0)})\), \(~~\theta_3^{(1)} \sim p(\theta_1 | x,\theta_1^{(1)},\theta_2^{(1)})\).
(Step 2) \(\theta_1^{(2)} \sim p(\theta_1 | x,\theta_2^{(1)},\theta_3^{(1)})\), \(~~\theta_2^{(2)} \sim p(\theta_2 | x,\theta_1^{(2)},\theta_3^{(1)})\), \(~~\theta_3^{(2)} \sim p(\theta_1 | x,\theta_1^{(1)},\theta_2^{(1)})\).
\(\cdots\) (Step m) \(\theta_1^{(m)} \sim p(\theta_1 | x,\theta_2^{(m-1)},\theta_3^{(m-1)})\), \(~~\theta_2^{(m)} \sim p(\theta_2 | x,\theta_1^{(m)},\theta_3^{(m-1)})\), \(~~\theta_3^{(m)} \sim p(\theta_1 | x,\theta_1^{(m)},\theta_2^{(m)})\).

이렇듯 조건에 들어가는 모수값을 가장 최근의 값들로 대체하면서 차례대로 \(\theta_1, \theta_2, \theta_3\)을 완전조건부 사전분포로 추출하면, \(m\)이 충분히 클 때 근사적으로 \(\theta^{(m)}=(\theta_1^{(m)},\theta_2^{(m)},\theta_3^{(m)})\)은 \(p(\theta|x)\) 분포를 따르므로 \(\theta^{(m)}=(\theta_1^{(m)},\theta_2^{(m)},\theta_3^{(m)})\)을 \(\theta\)의 사후표본으로 사용할 수 있다.

Example

\(X_1, \ldots, X_10 \sim N(\mu,\sigma^2)\), \(\mu \sim (\mu_0,\sigma_0^2)\), \(\sigma^2 \sim IG(a,b)\)이고, \((x_1, x_2, \ldots, x_{10}) = (10,13,15,11,9,18,20,17,23,21)\)일 때, \(\theta = (\mu,\sigma^2)\)에 대해 깁스 표본기법을 적용해 보자.

library(pscl)


M=3000; m=500
mu0=10; sig20=25; a=1; b=1
x = c(10,13,15,11,9,18,20,17,23,21)
n = length(x)
xbar = mean(x); var.x=var(x)
THETA = matrix(nrow=M,ncol=2)

# Initial value of sig2
sig2 = var.x

# simulation

for (nsim in 1:M) { 

# generate mu

condpost.mu = (sig2/n*mu0+sig20*xbar)/(sig2/n+sig20)
condpost.var = 1/(1/sig20+n/sig2)
mu = rnorm(1,mean=condpost.mu,sd=sqrt(condpost.var))

# generate sig2

condpost.a = a+n/2
condpost.b = b+1/2*((n-1)*var.x+n*(xbar-mu)^2)
sig2 = rigamma(1,alpha=condpost.a,beta=condpost.b)

# save

THETA[nsim,] = c(mu,sig2)

}


## 표본의 이동경로로서 왼쪽부터 5회, 15회, 100회까지의 경로 

par(mfrow=c(1,3))

plot(THETA[1:5,], type="n", xlab=expression(mu), ylab=expression(sigma^2))
lines(THETA[1:5,],lty=2)
for(i in 1:5){
  text(THETA[i,1],THETA[i,2],i)
}

plot(THETA[1:15,], type="n", xlab=expression(mu), ylab=expression(sigma^2))
lines(THETA[1:15,],lty=2)
for(i in 1:15){
  text(THETA[i,1],THETA[i,2],i)
}

plot(THETA[1:100,], type="n", xlab=expression(mu), ylab=expression(sigma^2))
lines(THETA[1:100,],lty=2)
for(i in 1:100){
  text(THETA[i,1],THETA[i,2],i)
}

## 수렴시점 (m=500) 후의 표본의 산점도

par(mfrow=c(1,1))
plot(THETA[m:M,1],THETA[m:M,2], xlab=expression(mu), ylab=expression(sigma^2), main="samples of (mu,sigma^2) " )

## 수렴시점 (m=500) 후의 $\mu$와 $\sigma^2$의 주변사후밀도함수 $p(\mu|x)$와 $p(\sigma^2|x)$

par(mfrow=c(1,2))
plot(density(THETA[m:M,1]), xlab=expression(mu), ylab="posterior", main="marginal posterior of mu")
plot(density(THETA[m:M,2]), xlab=expression(sigma^2), ylab="posterior", main="marginal posterior of sigma^2")

'분석 R > 구현' 카테고리의 다른 글

[R][Windows 10] R 4.0 에서 reticulate를 사용하여 conda에서 Tensorflow 설치해보기 (0)	2020.05.04
[ R ] RandomForest tree visualization (0)	2019.10.29
[R] ggplot Geom_Bar Tips (0)	2019.10.20
[R] ggwordcloud example (0)	2019.09.29
[ R ] AUCROC and KS (H2O) (0)	2019.03.16

Markov Chain Monte Carlo Simulation

LeeSungRyeong

Uniform distribution as proposal distribution

Metropolis-Hastings algorithm: Chi-Square distribution as proposal distribution

Introduction to Gibbs sampler

Example

'분석 R > 구현' 카테고리의 다른 글

AI 도구

AI 도구 사이드 패널

티스토리툴바