# Dave Abercrombie, aberdave.blogspot.com
# June 11 2011
#
# Demonstrates use of the Q-Q plot used to assess the fit
# of sampled data to reference distributions. This example
# uses pseudo-random generated log-normal data. The Q-Q plots
# make it very easy to see its poor fit with a normal
# distribution, and its excellent fit after log transform.

ls() # see if we have any stuff lying around

#
###################################################
# Quantile-Quantile (Q-Q) plots provide graphical
# comparison of sampled data to known distributions.
# It's a good idea to normalize the sampled data by
# subtracting its mean and dividing by its standard
# deviation. This is sometimes called z-score
# normalization or standardization. In R, we use
# the znorm() function to do this, and it is in the
# dprep package. But rather than installing and loading
# that package, I define a simple version here.
#
# Argument: x, a numeric vector.
# Value:    a vector of the same length as x with mean 0 and
#           standard deviation 1 (NA/NaN if x contains NA or
#           has zero spread, because mean() and sd() do).
#
z.std.f <- function(x) {
  (x - mean(x)) / sd(x)
}

# Draw one plot into a PNG file using the geometry shared by
# every figure in this script (432 x 432 px at 72 dpi).
#
# Arguments: filename, path of the PNG file to write;
#            draw.f, a zero-argument function that issues the
#            plotting calls.
# Value:     filename, invisibly.
#
# on.exit() guarantees the device is closed, and the file
# flushed, even when draw.f() stops with an error.
#
png.plot.f <- function(filename, draw.f) {
  png(filename, width = 432, height = 432, units = "px", res = 72)
  on.exit(dev.off(), add = TRUE)
  draw.f()
  invisible(filename)
}

# Randomly sample 300 times from a normal distribution
# (default mean = 0, standard deviation = 1). Apply the
# exponential function to generate a log normal dataset.
# The sample differs on every run; call set.seed() before
# this line if reproducible figures are needed.
#
log.normal.data <- exp(rnorm(300))

# Standardize the data to mean=0, sd=1 (a z-score)
log.normal.data.z <- z.std.f(log.normal.data)

# Better and easier than a histogram
png.plot.f(
  "log-normal.QQ-plot.density-estimate.png",
  function() {
    plot(
      density(log.normal.data.z),
      main = "Density estimate of log.normal.data.z"
    )
  }
)

# Prepare to do a Q-Q plot. We want to graphically
# compare the sample quantiles to the expected
# quantiles. If samples were taken from a normal
# distribution, the points would line up with a
# slope of 1 and an intercept of zero. Significant
# deviations from this line indicate a lack of fit to
# the distribution.
#
# We see significant deviations from the line, so
# we discard the notion that the samples came
# from a normal distribution.
#
# The y axis is kept symmetric about zero. It spans at least
# -6..6 and widens when the sample contains a more extreme
# standardized value, so no point is silently clipped.
#
qq.y.max <- max(6, ceiling(max(abs(log.normal.data.z))))

# qqnorm() draws the plot itself, so the labels and limits are
# passed to it directly instead of re-plotting its return value.
png.plot.f(
  "log-normal.QQ-plot.normal-bad-fit.png",
  function() {
    qqnorm(
      log.normal.data.z,
      main = "Normal Q-Q log.normal.data",
      xlab = "Theoretical Quantiles",
      ylab = "Sample Quantiles",
      ylim = c(-qq.y.max, qq.y.max)
    )
    abline(0, 1) # see also qqline()
  }
)

#
###################################################
# Q-Q plot for log transformed data
#
# Let's try a log() transform of these data to
# see if they came from a log-normal distribution.
# The sampled data fall neatly along the line, so they
# seem to have been sampled from a log-normal distribution.
#
log.normal.data.log.z <- z.std.f(log(log.normal.data))

png.plot.f(
  "log-normal.QQ-plot.log-normal-good-fit.png",
  function() {
    qqnorm(
      log.normal.data.log.z,
      main = "Log Normal Q-Q log(log.normal.data)",
      xlab = "Theoretical Quantiles",
      ylab = "Sample Quantiles"
    )
    abline(0, 1) # see also qqline()
  }
)