# Dave Abercrombie, aberdave.blogspot.com
# June 11 2011
#
# Demonstrates use of the Q-Q plot used to assess the fit
# of sampled data to reference distributions. This example
# uses pseudo-random generated log-normal data. The Q-Q plots
# make it very easy to see its poor fit with a normal
# distribution, and its excellent fit after log transform.
# List the names of objects already present in the current environment,
# so leftovers from earlier work are visible before new ones are created.
ls() # see if we have any stuff lying around
# ###################################################
# Quantile-Quantile (Q-Q) plots provide graphical
# comparison of sampled data to known distributions.
# It's a good idea to normalize the sampled data by
# subtracting its mean and dividing by its standard
# deviation. This is sometimes called z-score
# normalization or standardization. In R, we use
# the znorm() function to do this, and it is in the
# dprep package. But rather than installing and loading
# that package, I define a simple version here.
#
# Minimal z-score helper: centre x on its mean, then scale the
# result by the sample standard deviation of x.
z.std.f <- function(x) {
  centered <- x - mean(x)
  centered / sd(x)
}
# Randomly sample 300 times from a normal distribution
# (default mean = 0, standard deviation = 1). Apply the
# exponential function to generate a log normal dataset.
#
# Draw 300 standard-normal variates and exponentiate them, which
# yields a log-normal sample.
log.normal.data <- exp(rnorm(n = 300, mean = 0, sd = 1))
# Convert the sample to z-scores (mean 0, standard deviation 1).
log.normal.data.z <- z.std.f(log.normal.data)
# A kernel density estimate of the standardized sample, written to a
# PNG file: better and easier than a histogram.
png(
  filename = "log-normal.QQ-plot.density-estimate.png",
  width = 432,
  height = 432,
  units = "px",
  res = 72
)
plot(
  x = density(log.normal.data.z),
  main = "Density estimate of log.normal.data.z"
)
dev.off()
# Prepare to do a Q-Q plot. We want to graphically
# compare the sample quantiles to the expected
# quantiles. If samples were taken from a normal
# distribution, the points would line up with a
# slope of 1 and an intercept of zero. Significant
# deviations from this line indicate a lack of fit to
# the distribution.
#
# We see significant deviations from the line, so
# we discard the notion that the samples came
# from a normal distribution.
#
# Normal Q-Q plot of the standardized (untransformed) sample, written
# to a PNG file.
#
# qqnorm() draws the plot itself, so it is called directly instead of
# being wrapped in plot(): wrapping it rendered the figure twice and
# the PNG kept only the second rendering.
#
# The y-axis is symmetric about zero and spans at least -6..6, but it
# widens when the random sample contains a more extreme z-score, so
# that no point is clipped off the plot.
png("log-normal.QQ-plot.normal-bad-fit.png",
    width = 432, height = 432, units = "px", res = 72)
qqnorm(
  log.normal.data.z,
  main = "Normal Q-Q log.normal.data",
  xlab = "Theoretical Quantiles",
  ylab = "Sample Quantiles",
  ylim = c(-1, 1) * max(6, ceiling(max(abs(log.normal.data.z))))
)
abline(0, 1) # reference line: intercept 0, slope 1; see also qqline()
dev.off()
# ###################################################
# Q-Q plot for log transformed data
#
# Let's try a log() transform of these data to
# see if they came from a log-normal distribution.
# The sampled data fall neatly along the line, so they
# seem to have been sampled from a log-normal distribution.
#
# Log-transform the sample, then standardize it to z-scores. If the
# original data are log-normal, the transformed values are normal.
log.normal.data.log.z <- z.std.f(log(log.normal.data))
# Normal Q-Q plot of the log-transformed sample, written to a PNG file.
# qqnorm() draws the plot itself, so it is called directly instead of
# being wrapped in plot(), which rendered the figure twice.
png("log-normal.QQ-plot.log-normal-good-fit.png",
    width = 432, height = 432, units = "px", res = 72)
qqnorm(
  log.normal.data.log.z,
  main = "Log Normal Q-Q log(log.normal.data)",
  xlab = "Theoretical Quantiles",
  ylab = "Sample Quantiles"
)
abline(0, 1) # reference line: intercept 0, slope 1; see also qqline()
dev.off()