library(muhaz)
kphaz.plot <- function(fit, minx=0, miny=0, maxx=0, maxy=0, step=FALSE, smooth=FALSE, points=FALSE, ...)
{
########################################################################
#   "kphaz.plot" is an S function plots the K-M-type hazard
#   function estimate generated by kphaz.fit.
#   Required Arguments:
#   fit: results of a call to kphaz.fit.
#   ...: additional arguments for the plot function
########################################################################
#
# Make sure time and haz exist and are the same length
#
	if(any(names(fit) == "time") & (any(names(fit) == "haz"))) {
		time <- fit$time
		haz <- fit$haz
	}
	else stop("Arguement \"fit\" must be the result of a call to \"kphaz.fit\""
			)	#
	if(length(time) != length(haz)) stop(
			"Arguement \"fit\" must be the result of a call to \"kphaz.fit\""
			)	#
# Check to see if there are any strata
#
	qstrata <- any(names(fit) == "strata")
	if(qstrata)
		strata <- fit$strata
	else strata <- rep(1, length(time))
	if(length(strata) != length(haz)) stop(
			"Arguement \"fit\" must be the result of a call to \"kphaz.fit\""
			)	#
# Define, ustrata, the number of unique strata
#
	ustrata <- unique(strata)
	good <- 1:length(ustrata)
	for(i in 1:length(ustrata)) {
		cur.strata <- ustrata[i]
		ind <- strata == ustrata[i]
		if(all(is.na(haz[ind] | is.nan(haz[ind]))))
			good <- good[good != i]
	}
	ustrata <- ustrata[good]	#

# Check to see if there are any plots to be made
#
	if(length(ustrata) < 1) stop("No plots")	#
# Make the first plot
#

# Gary Feng: add maxx and maxy
	ind <- (strata == ustrata[1]) & (!is.nan(haz))
#	xmax <- max(time)
#	ymax <- max(haz[((!is.nan(haz)) & (!is.na(haz)))])
	if(missing(maxx) | maxx==0){
		xmax <- max(time)
	} else {
		xmax <-maxx
	}
	
	if(missing(maxy) | maxy==0) {
		ymax <- max(haz[((!is.nan(haz)) & (!is.na(haz)))])
	} else {
		ymax <- maxy
	}
## End Gary Feng

	x <- time[ind]
	y <- haz[ind]
	if(min(x) > 0) {
		x <- c(0, x)
		y <- c(0, y)
	}

# Gary Feng: empty plot for now
#	plot(x, y, xlim = c(0, xmax), ylim = c(0, ymax), xlab = "Time",
#		ylab = "Hazard", type = "s", ...)	#
	plot(x, y, xlim = c(minx, xmax), ylim = c(miny, ymax), xlab = "Time",
		ylab = "Hazard", type = "n", ...)	#
		
# Make any subsequent plots
#
#	if(length(ustrata) > 1) {
#		for(i in 2:length(ustrata)) {
	if(length(ustrata) >= 1) {
		for(i in 1:length(ustrata)) {
#			ind <- strata == ustrata[i]
			ind <- (strata == ustrata[i]) & (!is.nan(haz)) & (!is.na(haz))
			x <- time[ind]
# Gary Feng: the function gives error message:
# Error in stepfun(x, y) : 'y' must be one longer than 'x'

# adding a zero at the beginning
			if (step==TRUE) {
				y <- c(0, haz[ind])
			} else {
				y <- haz[ind]
			}

			if(min(x) > 0) {
				x <- c(0, x)
				y <- c(0, y)
			}

# Gary Feng: use lines, not steps
			if (step==TRUE) {
				lines(stepfun(x, y), lty = i, col=i)
			} else {
				if (points) points(x,y,col=i)
# Gary Feng: Smooth data if needed
				if (smooth==TRUE) {
					y<-smooth(y)
				}
				lines(x,y, col=i, lty = i)
#				lines(x,y, col="light gray", lty = 1)
			}
# end Gary Feng

		}
	}
#
# Return
#
	invisible()
}
# kphaz.plot(out, maxy=0.02, smooth=TRUE)
###############################################


library(segmented)

###############################################
# segmented Regression Analysis
#
# function(
#    dur: fixation duration vector
#    initpara: initial guesses of where the change points are
#    maxtime: end point of the hazard rate
#    mintime: starting point of the hazard rate
#    binwidth: force to round original data
###############################################
segreg <-function(fixdur, initpara, maxtime=600, mintime=50, binwidth=1) {

###################################
# estimate the hazard rate
# bin width 
fixdur<-round(fixdur/binwidth)*binwidth
status<-rep(1,length(fixdur))
out1<-kphaz.fit(fixdur,status)
print(out1)

###################################
# Now do the segmented regression
ind<-(out1$time<=maxtime & out1$time>=mintime & !is.na(out1$haz))
d<-data.frame( x=out1$time[ind], y=out1$haz[ind])
h.lm<-lm(y~x,data=d)
summary(h.lm)

try(h.seg1<-segmented(h.lm,seg.Z=~x,psi=list(x=initpara),control=seg.control(display=FALSE,h=0.5)), silent=FALSE)
traceback()

#print(summary(h.seg1))
#print(slope(h.seg1))

#################
# plotting setup
par(mfrow=c(1, 1))
plot(out1$time, out1$haz, , xlim=c(50,maxtime), ylim=c(-.005,0.03),xlab="Fixation Duration", 
	ylab="Hazard rate/Prob. density",type="n")

##################
# plotting
##################
i=2
lines(out1$time, out1$hazraw, col="light gray")
lines(d$x, smooth(d$y), col=1)
plot(h.seg1, add=TRUE, col=i)
lines(h.seg1, term="x", col=i, pch=1, k=i*3+10)

# add pdf: use a large high limit for breakpoints, else the pdf will reflect the censoring. 
temp=fixdur[fixdur>0 & fixdur<2000]
fixhist<-hist(temp, 10*(1:200), plot=FALSE)
histtime<-c(0, fixhist$breaks[1:maxtime])
histpdf<-c(0, fixhist$density[1:maxtime])
lines(histtime,histpdf , col="pink")

# add horizontal line at 0,0
lines(c(0,maxtime), c(0,0))

}

# No plot
segreg2 <-function(fixdur, initpara, maxtime=600, mintime=50, binwidth=20) {
###################################
# estimate the hazard rate
# bin width 
fixdur<-round(fixdur/binwidth)*binwidth
status<-rep(1,length(fixdur))
out1<-kphaz.fit(fixdur,status)
#print(out1)

###################################
# Now do the segmented regression
ind<-(out1$time<=maxtime & out1$time>=mintime & !is.na(out1$haz))
d<-data.frame( x=out1$time[ind], y=out1$haz[ind])
h.lm<-lm(y~x,data=d)
summary(h.lm)
print(h.lm)

#try(h.seg1<-segmented(h.lm,seg.Z=~x,psi=list(x=initpara),control=seg.control(display=FALSE,h=0.5)), silent=FALSE)
h.seg1<-segmented(h.lm,seg.Z=~x,psi=list(x=initpara),control=seg.control(display=FALSE,h=0.5))
traceback()
}

#library(foreign)
#read.spss("study123456.sav") ->allfix
# now all data are saved in rdata format
load("study123456.rda")

# testing, 1, 2, 3
fixdur=sample(allfix$dur[!is.na(allfix$dur) & allfix$dur>1 & allfix$dur<1000], 50000, replace=TRUE, prob=NULL)
segreg2(fixdur, initpara=c(180))

###############
# bootstrapping
# k= # of samples, n=sample size, alpha=sig level
###############
results<-NULL
initpara=c(100, 180, 250)
maxtime=600
mintime=50
binwidth=20

k=10000; n=5000

for(i in 1:k)
{
# sample
fixdur=sample(allfix$dur[!is.na(allfix$dur)], 50000, replace=TRUE, prob=NULL)
# get haz	
fixdur<-round(fixdur/binwidth)*binwidth
status<-rep(1,length(fixdur))
out1<-kphaz.fit(fixdur,status)
# seg reg
ind<-(out1$time<=maxtime & out1$time>=mintime & !is.na(out1$haz))
d<-data.frame( x=out1$time[ind], y=out1$haz[ind])
h.lm<-lm(y~x,data=d)
h.seg1<-NULL
try(h.seg1<-segmented(h.lm,seg.Z=~x,psi=list(x=initpara),control=seg.control(display=FALSE,h=0.5)), silent=FALSE)
#try(print(summary(h.seg1)))
#try(print(slope(h.seg1)))

results<-c(results, list(h.seg1))
}

save (results, file="bootstrapping10k.rdata")


p1<-NULL; p2<-NULL; p3<-NULL;
se1<-NULL; se2<-NULL; se3<-NULL;

for(i in 1:k)
{
try(p1[i]<-results[[i]]$"psi"[[1,2]])
try(p2[i]<-results[[i]]$"psi"[[2,2]])
try(p3[i]<-results[[i]]$"psi"[[3,2]])
try(se1[i]<-results[[i]]$"psi"[[1,3]])
try(se2[i]<-results[[i]]$"psi"[[2,3]])
try(se3[i]<-results[[i]]$"psi"[[3,3]])
}


########################
# bootstrapping
########################


bootstat<-function(x,alpha=0.05){
	x=x[!is.na(x) & !is.null(x)]	
	meanboot=mean(x)
    cat("Mean Bootstrap Sample :",meanboot,"\n")
	
	varboot=var(x)
      seboot=sqrt(varboot)
       	 cat("bootstrap standard error =",seboot,"\n")
       	 cat("\n")
	lobound=quantile(x,(alpha/2))
      upbound=quantile(x,(1-(alpha/2)))
 		cat(((1-alpha)*100),"% Confidence Interval for Mean Statistic:","\n")
            cat("lower bound =",lobound,"\n")
            cat("upper bound =",upbound,"\n")
	hist(x, breaks=50)
 }

bootstat(p1)
bootstat(p2)
bootstat(p3)


####################################
# PART II: can segreg detect changepoints in hazard function?

k=1000; m=5000;
segregmc<-NULL
a=1/2; b=1; d=1;

for(i in 1:k)
{
u<-runif(m)
# this is the inversed CDF of the piecewise linear hazard function, 
# with the first slope a, the 2nd slope b, 
# changepoint at d.
v<-ifelse(u<=1-(exp(-a^2*d^2)), 
sqrt(log(1/(1-u)))/a, 
((b^2-a^2)*d+sqrt(a^4*d^2-(a*b*d)^2-b^2*log(1-u)))/b^2)
# plot(v, u)

# now estimate the hazard function; discretize to speed up a bit
w<-round(v*100)/100
status<-rep(1,length(w))
out<-kphaz.fit(w,status)
#kphaz.plot(out, maxy=10, smooth=TRUE)

# drop the hazard tail
maxtime=2; mintime=0;
ind<-(out$time<=maxtime & out$time>=mintime & !is.na(out$haz))
da<-data.frame( x=out$time[ind], y=out$haz[ind])
h.lm<-lm(y~x,data=da)
h.seg1<-NULL
# use a randomized initial value of 1 +/- 1/3. 
initpara=0.75+runif(1)/2
try(h.seg1<-segmented(h.lm,seg.Z=~x,psi=list(x=initpara),control=seg.control(display=FALSE,h=0.5)), silent=FALSE)
#try(print(summary(h.seg1)))
#try(print(slope(h.seg1)))

segregmc<-c(segregmc, list(h.seg1))
}
save (segregmc, file="segregmc10k.rdata")

p1<-NULL; se1<-NULL; 
for(i in 1:k)
{
try(p1[i]<-segregmc[[i]]$"psi"[[1,2]])
try(se1[i]<-segregmc[[i]]$"psi"[[1,3]])
}

bootstat(p1)
bootstat(se1)

i=2
# lines(out$time, out$hazraw, col="light gray")
lines(d$x, smooth(d$y), col=1)
plot(h.seg1, add=TRUE, col=i)
lines(h.seg1, term="x", col=i, pch=1, k=i*3+10)


####################################
# PART III: Jittering
# Dirk Vorberg argued in his review that jittering will mess up the 
# hazard function and implied that the segmented regression will
# not work. Let's test it with the piecewise Weibull with jittering

k=1000; m=5000;
jitter<-NULL
a=1/2; b=1; 
jitter_sd=0.2; #<-- 95% CI gives 0.6-1.4, pretty large
# now let d be a random normal variable
d=rnorm(m,1,jitter_sd);

for(i in 1:k)
{
u<-runif(m)
# this is the inversed CDF of the piecewise linear hazard function, 
# with the first slope a, the 2nd slope b, 
# changepoint at d.
v<-ifelse(u<=1-(exp(-a^2*d^2)), 
sqrt(log(1/(1-u)))/a, 
((b^2-a^2)*d+sqrt(a^4*d^2-(a*b*d)^2-b^2*log(1-u)))/b^2)
# plot(v, u)

# now estimate the hazard function; discretize to speed up a bit
w<-round(v*100)/100
status<-rep(1,length(w))
out<-kphaz.fit(w,status)
if(k==1) kphaz.plot(out, maxy=4, maxx=3, smooth=TRUE)

# drop the hazard tail
maxtime=2; mintime=0;
ind<-(out$time<=maxtime & out$time>=mintime & !is.na(out$haz))
da<-data.frame( x=out$time[ind], y=out$haz[ind])
h.lm<-lm(y~x,data=da)
h.seg1<-NULL
# use a randomized initial value of 1 +/- 1/3. 
initpara=0.75+runif(1)/2
try(h.seg1<-segmented(h.lm,seg.Z=~x,psi=list(x=initpara),control=seg.control(display=FALSE,h=0.5)), silent=FALSE)
try(print(summary(h.seg1)))
try(print(slope(h.seg1)))

jitter<-c(jitter, list(h.seg1))
}
save (jitter, file="jitter.rdata")

p1<-NULL; se1<-NULL; 
for(i in 1:k)
{
try(p1[i]<-jitter[[i]]$"psi"[[1,2]])
try(se1[i]<-jitter[[i]]$"psi"[[1,3]])
}

bootstat(p1)
bootstat(se1)

i=2
# lines(out$time, out$hazraw, col="light gray")
lines(d$x, smooth(d$y), col=1)
plot(h.seg1, add=TRUE, col=i)
lines(h.seg1, term="x", col=i, pch=1, k=i*3+10)