###################
#A brief demonstration of independent measures bootstraps.
#NOTE that these are independent measures, which isn't really what we would use, so they are just for illustration
##################
library(dplyr)
library(tidyr)
#read in the data
#read in the data (expects results.no.outliers.csv in the current working directory)
d <- read.csv("results.no.outliers.csv")
######################
#An independent measures, non-parametric bootstrap test, using sample()
######################
#Let's do this for differences between means of the two "long" sentences
wh <- subset(d, dependencyLength == "lg")
#drop the factor levels that were filtered out by the subset above
wh <- droplevels(wh)
#Step 1: combine the two samples together into one sample to act as an approximation of the population
population <- wh$zscores
#Step 2: sample with replacement 9999 times
#we create one less than the number we want so that we can add the correction factor of +1 at the end
#notice that we sample the full 112 values per replicate. This is because we can simply use the first 56 as the control group, and the second 56 as the target group.
#Notice also that this is clearly an independent measures test, since we are treating each of the 112 values as independent from each other!
#(each replicate becomes one column of the resulting 112 x 9999 matrix)
boot.replicates <- replicate(n = 9999, expr = sample(x = population, size = 112, replace = TRUE))
#Step 3:
#create a function to calculate mean differences. This treats the first 56 values as the control group, and the last 56 as the target group
#Calculate the control-minus-target difference in means for one resampled data set.
#dataset: numeric vector of resampled values; NAs are ignored when averaging
#n.control, n.target: group sizes; the defaults (56/56) reproduce the original
#  hard-coded split of a 112-value replicate, but other sizes can now be passed
mean.diff <- function(dataset, n.control = 56, n.target = 56){
  mean(dataset[seq_len(n.control)], na.rm = TRUE) -
    mean(dataset[n.control + seq_len(n.target)], na.rm = TRUE)
}
#Step 4:
#now apply this new function to all of the bootstrap replicates we created above
#(MARGIN = 2 applies mean.diff to each column, i.e. to each resampled data set)
diffs <- apply(X = boot.replicates, MARGIN = 2, FUN = mean.diff)
#Step 5: calculate the p-value
#first we calculate our observed difference between the two embeddedStructure condition means
means <- wh %>%
  group_by(embeddedStructure) %>%
  summarize(m = mean(zscores, na.rm = TRUE)) %>%
  ungroup()
#difference is second level minus first level of embeddedStructure (factor order)
observed.diff <- as.numeric(means[2, 2] - means[1, 2])
#Then we count the number of results in our simulation that are equal to or greater than our observed difference, and add 1 as the correction factor
#(this is a one-tailed count; it assumes the observed difference is in the predicted direction)
numerator <- sum(diffs >= observed.diff) + 1
#Then we count the number of simulations we ran, and add 1 as a correction factor
denominator <- length(diffs) + 1
#Finally we put them together to calculate the p-value
p <- numerator / denominator
###Note that you can also use the boot package in R to do bootstraps, but I like calculating them by hand
######################
#An independent measures, parametric bootstrap test, using rnorm()
######################
#Let's do this for differences between means of the two "long" sentences
wh <- subset(d, dependencyLength == "lg")
#drop the factor levels that were filtered out by the subset above
wh <- droplevels(wh)
#Step 1: rescale the observed data to be in a standard normal distribution
#R gives a nice built-in function called scale() that calculates z-scores for a vector!
#scale() returns a one-column matrix, so we flatten it with as.numeric() to keep
#the new column a plain numeric vector (downstream mean() is unaffected either way)
wh.scaled <- wh %>%
  mutate(scaled = as.numeric(scale(zscores)))
#Note that in this test our population is infinite, and it is defined by a probability function rnorm(). So we don't need to define it ahead of time, we can just plop the function into our resampling code!
#Step 2: sample 9999 times
#we create one less than the number we want so that we can add the correction factor of +1 at the end
#notice that we draw 112 values per replicate. This is because we can simply use the first 56 as the control group, and the second 56 as the target group.
#Notice also that this is clearly an independent measures test, since we are treating each of the 112 values as independent from each other!
boot.replicates <- replicate(n = 9999, expr = rnorm(n = 112, mean = 0, sd = 1))
#Step 3:
#create a function to calculate mean differences. This treats the first 56 values as the control group, and the last 56 as the target group
#Calculate the control-minus-target difference in means for one resampled data set.
#dataset: numeric vector of resampled values; NAs are ignored when averaging
#n.control, n.target: group sizes; the defaults (56/56) reproduce the original
#  hard-coded split of a 112-value replicate, but other sizes can now be passed
mean.diff <- function(dataset, n.control = 56, n.target = 56){
  mean(dataset[seq_len(n.control)], na.rm = TRUE) -
    mean(dataset[n.control + seq_len(n.target)], na.rm = TRUE)
}
#Step 4:
#now apply this new function to all of the bootstrap replicates we created above
#(MARGIN = 2 applies mean.diff to each column, i.e. to each resampled data set)
diffs <- apply(X = boot.replicates, MARGIN = 2, FUN = mean.diff)
#Step 5: calculate the p-value
#first we calculate our observed difference between the two embeddedStructure condition means
#(computed on the rescaled values, to match the standard-normal null we sampled from)
means <- wh.scaled %>%
  group_by(embeddedStructure) %>%
  summarize(m = mean(scaled, na.rm = TRUE)) %>%
  ungroup()
#difference is second level minus first level of embeddedStructure (factor order)
observed.diff <- as.numeric(means[2, 2] - means[1, 2])
#Then we count the number of results in our simulation that are equal to or greater than our observed difference, and add 1 as the correction factor
#(this is a one-tailed count; it assumes the observed difference is in the predicted direction)
numerator <- sum(diffs >= observed.diff) + 1
#Then we count the number of simulations we ran, and add 1 as a correction factor
denominator <- length(diffs) + 1
#Finally we put them together to calculate the p-value
p <- numerator / denominator
###Note that you can also use the boot package in R to do bootstraps, but I think it is better to do the first few by hand before using a package to do it.