# Dave Abercrombie, aberdave.blogspot.com # June 11 2011 # # Source of data: # http://shallahamer-orapub.blogspot.com/2011/02/sql-statement-elapsed-times.html # http://filezone.orapub.com/Research/sql_elapsed_distribution/AnalysisPack.zip # Garret8qtkxy0g5d1p3_1.txt # after "/usr/bin/cut -c1-108", as if SQL*Plus TRIMSPOOL had been set ls() # see if we have any stuff lying around # change as needed setwd("/home/dave/Documents/R-play/cs/20110203-analysis-pack") # Import white-space delimited text file into a data frame. # Name the columns since the file has no header. garret8q.df <- read.table("Garret8qtkxy0g5d1p3_1.txt") colnames(garret8q.df) <- c( "sql.id.plan.hash.value", "sample.no", "executions", "disk.reads", "buffer.gets", "cpu.time", "elapsed.time" ) # Display the data frame structure and its first few rows # for a visual sanity check. str(garret8q.df) head(garret8q.df) # Calculate basic univariate stats for the elapsed time variable. summary(garret8q.df$elapsed.time) sd(garret8q.df$elapsed.time) # compare this histogram to the (better?) density plot below png("Garret8q.QQ-plot.histogram.png", width=432, height=432, units="px", res=72 ) hist(garret8q.df$elapsed.time, breaks=20) dev.off() # ################################################### # Explore distribution using density plot rather than # histogram: # # "Many statisticians recommend using density plots # instead of histograms because they are more robust # and easier to read." (Adler 2010 p.238). # # So I demonstrate R's density() function below. # # Another exploratory plot is the Empirical Cumulative # Distribution Function (ECDF) which I do not show here. # png("Garret8q.QQ-plot.density-estimate.png", width=432, height=432, units="px", res=72 ) plot( density(garret8q.df$elapsed.time), main="Density estimate of elapsed.time" ) dev.off() # ################################################### # Quantile-Quantile (Q-Q) plots provide graphical # comparison of sampled data to know distributions. # It's a good idea to normalize the sampled data by # subtracting its mean and dividing by its standard # deviation. This is sometimes called z-score # normalization or standardization. In R, we use # the znorm() function to do this, and it is in the # dprep package. But rather than installing and loading # that package, I define a simple version here. # z.std.f <- function(x) { (x-mean(x))/sd(x) } elapsed.time.z <-z.std.f(garret8q.df$elapsed.time) # Prepare to do a Q-Q plot. We want to graphically # compare the sample quantiles to the expected # quantiles. If samples were taken from a normal # distribution, the points would line up with a # slope of 1 and and intercept of zero. Significant # deviations from this line indicate a lack of fit to # the distribution. # # We see significant deviations from the line, so # we discard the notion that the samples came # from a normal distribution. # png("Garret8q.QQ-plot.normal.png", width=432, height=432, units="px", res=72 ) plot( qqnorm(elapsed.time.z), main="Normal Q-Q elapsed.time", xlab="Theoretical Quantiles", ylab="Sample Quantiles" ) abline(0,1) # see also qqline() dev.off() # ################################################### # Q-Q plot for log transformed data # # Lets try a log() transform of these data to see if # they might have come from a log-normal distribution. # We again see significant deviations, so they were # not sampled from a log-normal distribution. # elapsed.time.log.z <-z.std.f(log(garret8q.df$elapsed.time)) png("Garret8q.QQ-plot.log-normal.png", width=432, height=432, units="px", res=72 ) plot( qqnorm(elapsed.time.log.z), main="Normal Q-Q log(elapsed.time)", xlab="Theoretical Quantiles", ylab="Sample Quantiles" ) abline(0,1) # see also qqline() dev.off() # ################################################### # Q-Q plot for a Poisson distribution # # Let's see if these data were sampled from a Poisson # distribution. Instead of qqnorm() which supports only # the normal distribution, we will use qqplot() that # supports arbitrary distributions. However, it requires # that you have that a distribution to use for comparison. # # In order to generate a Poisson distribution that resembles # the sample data, we can use the fitdistr() function that # is part of the MASS package. It will attempt to find a # Poisson distribution that is as close as possible to the # sampled data. Since the Poisson distribution is for discrete # variables, and our sampled data is continuous, we will ignore # fitdistr() warnings. We could scale the sample data to get rid # of the decimal points, but it is probably not worth the trouble. # # The fit is characterized by a single parameter lambda, # which is available in the "estimate" named element of # the list returned bu fitdistr(). # require(MASS) fit.poisson <- fitdistr(garret8q.df$elapsed.time, "Poisson") fit.poisson # Generate a random sample of a Poisson distribution that # is as large as our sampled data and has an appropriate # value of lambda. # elapsed.time.p <- rpois( length(garret8q.df$elapsed.time), fit.poisson$estimate ) # standardize the random samples elapsed.time.p.z <- z.std.f(elapsed.time.p) # Compare the SQL elapsed time data from Oracle to # a randomly generated Poisson distribution. Once again, # we see significant deviations from the line, so we conclude # that the observations were not sampled from a Poisson # distribution. # png("Garret8q.QQ-plot.poisson.png", width=432, height=432, units="px", res=72 ) plot( qqplot(elapsed.time.z, elapsed.time.p.z), main="Poisson Q-Q elapsed.time", xlab="Theoretical Quantiles", ylab="Sample Quantiles" ) abline(0,1) # see also qqline() dev.off()