# Dave Abercrombie, aberdave.blogspot.com
# June 11 2011
#
# Source of data:
# http://shallahamer-orapub.blogspot.com/2011/02/sql-statement-elapsed-times.html
# http://filezone.orapub.com/Research/sql_elapsed_distribution/AnalysisPack.zip
# Garret8qtkxy0g5d1p3_1.txt
# after "/usr/bin/cut -c1-108", as if SQL*Plus TRIMSPOOL had been set
ls() # see if we have any stuff lying around
# change as needed
setwd("/home/dave/Documents/R-play/cs/20110203-analysis-pack")
# Import white-space delimited text file into a data frame.
# Name the columns since the file has no header.
garret8q.df <- read.table("Garret8qtkxy0g5d1p3_1.txt")
colnames(garret8q.df) <- c(
"sql.id.plan.hash.value",
"sample.no",
"executions",
"disk.reads",
"buffer.gets",
"cpu.time",
"elapsed.time"
)
# Display the data frame structure and its first few rows
# for a visual sanity check.
str(garret8q.df)
head(garret8q.df)
# Calculate basic univariate stats for the elapsed time variable.
summary(garret8q.df$elapsed.time)
sd(garret8q.df$elapsed.time)
# compare this histogram to the (better?) density plot below
png("Garret8q.QQ-plot.histogram.png",
width=432, height=432, units="px", res=72 )
hist(garret8q.df$elapsed.time, breaks=20)
dev.off()
# ###################################################
# Explore distribution using density plot rather than
# histogram:
#
# "Many statisticians recommend using density plots
# instead of histograms because they are more robust
# and easier to read." (Adler 2010 p.238).
#
# So I demonstrate R's density() function below.
#
# Another exploratory plot is the Empirical Cumulative
# Distribution Function (ECDF) which I do not show here.
#
png("Garret8q.QQ-plot.density-estimate.png",
width=432, height=432, units="px", res=72 )
plot( density(garret8q.df$elapsed.time),
main="Density estimate of elapsed.time"
)
dev.off()
# ###################################################
# Quantile-Quantile (Q-Q) plots provide graphical
# comparison of sampled data to know distributions.
# It's a good idea to normalize the sampled data by
# subtracting its mean and dividing by its standard
# deviation. This is sometimes called z-score
# normalization or standardization. In R, we use
# the znorm() function to do this, and it is in the
# dprep package. But rather than installing and loading
# that package, I define a simple version here.
#
z.std.f <- function(x) { (x-mean(x))/sd(x) }
elapsed.time.z <-z.std.f(garret8q.df$elapsed.time)
# Prepare to do a Q-Q plot. We want to graphically
# compare the sample quantiles to the expected
# quantiles. If samples were taken from a normal
# distribution, the points would line up with a
# slope of 1 and and intercept of zero. Significant
# deviations from this line indicate a lack of fit to
# the distribution.
#
# We see significant deviations from the line, so
# we discard the notion that the samples came
# from a normal distribution.
#
png("Garret8q.QQ-plot.normal.png",
width=432, height=432, units="px", res=72 )
plot(
qqnorm(elapsed.time.z),
main="Normal Q-Q elapsed.time",
xlab="Theoretical Quantiles",
ylab="Sample Quantiles"
)
abline(0,1) # see also qqline()
dev.off()
# ###################################################
# Q-Q plot for log transformed data
#
# Lets try a log() transform of these data to see if
# they might have come from a log-normal distribution.
# We again see significant deviations, so they were
# not sampled from a log-normal distribution.
#
elapsed.time.log.z <-z.std.f(log(garret8q.df$elapsed.time))
png("Garret8q.QQ-plot.log-normal.png",
width=432, height=432, units="px", res=72 )
plot(
qqnorm(elapsed.time.log.z),
main="Normal Q-Q log(elapsed.time)",
xlab="Theoretical Quantiles",
ylab="Sample Quantiles"
)
abline(0,1) # see also qqline()
dev.off()
# ###################################################
# Q-Q plot for a Poisson distribution
#
# Let's see if these data were sampled from a Poisson
# distribution. Instead of qqnorm() which supports only
# the normal distribution, we will use qqplot() that
# supports arbitrary distributions. However, it requires
# that you have that a distribution to use for comparison.
#
# In order to generate a Poisson distribution that resembles
# the sample data, we can use the fitdistr() function that
# is part of the MASS package. It will attempt to find a
# Poisson distribution that is as close as possible to the
# sampled data. Since the Poisson distribution is for discrete
# variables, and our sampled data is continuous, we will ignore
# fitdistr() warnings. We could scale the sample data to get rid
# of the decimal points, but it is probably not worth the trouble.
#
# The fit is characterized by a single parameter lambda,
# which is available in the "estimate" named element of
# the list returned bu fitdistr().
#
require(MASS)
fit.poisson <- fitdistr(garret8q.df$elapsed.time, "Poisson")
fit.poisson
# Generate a random sample of a Poisson distribution that
# is as large as our sampled data and has an appropriate
# value of lambda.
#
elapsed.time.p <- rpois(
length(garret8q.df$elapsed.time),
fit.poisson$estimate
)
# standardize the random samples
elapsed.time.p.z <- z.std.f(elapsed.time.p)
# Compare the SQL elapsed time data from Oracle to
# a randomly generated Poisson distribution. Once again,
# we see significant deviations from the line, so we conclude
# that the observations were not sampled from a Poisson
# distribution.
#
png("Garret8q.QQ-plot.poisson.png",
width=432, height=432, units="px", res=72 )
plot(
qqplot(elapsed.time.z, elapsed.time.p.z),
main="Poisson Q-Q elapsed.time",
xlab="Theoretical Quantiles",
ylab="Sample Quantiles"
)
abline(0,1) # see also qqline()
dev.off()