# Dave Abercrombie, dabercrombie@cconvio.com # See http://aberdave.blogspot.com # March 2011 db9.a.raw <- read.table("db9-awr.csv", header=TRUE, sep=",", quote="\"" ) > ls() [1] "db9.a.raw" > dim(db9.a.raw) [1] 175 8 > names(db9.a.raw) [1] "SNAP_ID" "BEGIN_HOUR" "DB_CPU_CSPH" "DB_TIME_CSPH" [5] "PCNT_CPU_UTILIZE" "WAIT_CSPH" "USER_CALLS_PH" "BUFFER_GETS_PH" ####################################### # To convert CentiSeconds Per Hour (_CSPH) to milliseconds # per milliseconds (_MSPMS), we divided by 3600*100, We also # take advantage of the dimensionless nature of this metric: # a second-per-second is the same as a millisecond-per-millisecond # # 569525 DB_TIME_CSPH # 5695.25 sec per hour - divide by 100 then by 3600 # 1.5820138889 sec per second - same as ms per ms (mspms) # # But to convert counts Per Hour (_PH) to counts per millisecond # we need to divude by 3600*1000 (the 3600 converts hours to seconds # and the 1000 converts seconds to milliseconds). db9.b.milliseconds <- data.frame( SNAP_ID = db9.a.raw$SNAP_ID, BEGIN_HOUR = db9.a.raw$BEGIN_HOUR, DB_CPU_MSPMS = db9.a.raw$DB_CPU_CSPH/(3600*100), DB_TIME_MSPMS = db9.a.raw$DB_TIME_CSPH/(3600*100), PCNT_CPU_UTILIZE = db9.a.raw$PCNT_CPU_UTILIZE, WAIT_MSPMS = db9.a.raw$WAIT_CSPH/(3600*100), USER_CALLS_PMS = db9.a.raw$USER_CALLS_PH/(3600*1000), BUFFER_GETS_PMS = db9.a.raw$BUFFER_GETS_PH/(3600*1000) ) > names(db9.b.milliseconds) [1] "SNAP_ID" "BEGIN_HOUR" "DB_CPU_MSPMS" "DB_TIME_MSPMS" [5] "PCNT_CPU_UTILIZE" "WAIT_MSPMS" "USER_CALLS_PMS" "BUFFER_GETS_PMS" > dim(db9.b.milliseconds) [1] 175 8 > ####################################### db9.b.milliseconds.lm <- lm(formula = PCNT_CPU_UTILIZE ~ BUFFER_GETS_PMS, data = db9.b.milliseconds ) summary(db9.b.milliseconds.lm) Call: lm(formula = db9.a$PCNT_CPU_UTILIZE ~ db9.a$BUFFER_GETS_PH) Residuals: Min 1Q Median 3Q Max -6.217 -2.509 -1.079 0.865 16.940 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -9.360e+00 1.985e+00 -4.716 4.93e-06 *** db9.a$BUFFER_GETS_PH 8.201e-08 3.179e-09 25.799 < 2e-16 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 4.436 on 173 degrees of freedom Multiple R-squared: 0.7937, Adjusted R-squared: 0.7925 F-statistic: 665.6 on 1 and 173 DF, p-value: < 2.2e-16 ####################################### png("db9-awr.plot-01.utilization-workload.175.scatter.png", width=800, height=600, units="px", res=72 ) plot( PCNT_CPU_UTILIZE ~ BUFFER_GETS_PMS, data=db9.b.milliseconds, main="CPU utilization as function of workoad, all 175 points", sub="db9.b.milliseconds", ylab="percent CPU utilization", xlab="workload: buffer gets per millisecond" ) abline(coef(db9.b.milliseconds.lm)) dev.off() ####################################### png("db9-awr.plot-02.utilization-workload.175.residuals.png", width=800, height=600, units="px", res=72 ) hist(residuals(db9.b.milliseconds.lm)) dev.off() ####################################### # a temporary aside - this URL is generally helpful # http://www.cyclismo.org/tutorial/R/linearLeastSquares.html > attributes(db9.b.milliseconds.lm) $names [1] "coefficients" "residuals" "effects" "rank" [5] "fitted.values" "assign" "qr" "df.residual" [9] "xlevels" "call" "terms" "model" $class [1] "lm" ####################################### # remove outliers db9.c.cleaned <- subset( cbind(db9.b.milliseconds, residuals(db9.b.milliseconds.lm)), residuals(db9.b.milliseconds.lm) < 8.0 ) > dim(db9.c.cleaned) [1] 161 9 ####################################### # lineal model regression for cleaned dataset db9.c.cleaned.lm <- lm(formula = PCNT_CPU_UTILIZE ~ BUFFER_GETS_PMS, data = db9.c.cleaned ) > summary(db9.c.cleaned.lm) Call: lm(formula = db9.c.cleaned$PCNT_CPU_UTILIZE ~ db9.c.cleaned$BUFFER_GETS_PMS) Residuals: Min 1Q Median 3Q Max -4.2937 -1.1895 -0.2859 0.8509 7.8559 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -4.755466 0.853090 -5.574 1.04e-07 *** db9.c.cleaned$BUFFER_GETS_PMS 0.261177 0.004981 52.438 < 2e-16 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 1.847 on 159 degrees of freedom Multiple R-squared: 0.9453, Adjusted R-squared: 0.945 F-statistic: 2750 on 1 and 159 DF, p-value: < 2.2e-16 ####################################### png("db9-awr.plot-03.utilization-workload.161.scatter.png", width=800, height=600, units="px", res=72 ) plot( PCNT_CPU_UTILIZE ~ BUFFER_GETS_PMS, data=db9.c.cleaned, main="CPU utilization as function of workoad, 161 points (removed 14 outliers)", sub="db9.c.cleaned", ylab="percent CPU utilization", xlab="workload: buffer gets per millisecond" ) abline(coef(db9.c.cleaned.lm)) dev.off() ####################################### # cleaned data residuals look normally distributed png("db9-awr.plot-04.utilization-workload.161.residuals.png", width=800, height=600, units="px", res=72 ) hist(residuals(db9.c.cleaned.lm)) dev.off() ####################################### # service time by workload # linear model # # serviceTime = (milliseconds of CPU per unit of time) / (units of work per unit of time) # serviceTime = milliseconds of CPU per unit of work # unit of time is one millisecond # unit of work is one buffer get # serviceTime = DB_CPU_MSPMS / BUFFER_GETS_PMS # # workload = units of work per unit of time # workload = BUFFER_GETS_PMS # db9.d.serviceTime.by.workload.lm <- lm(formula = DB_CPU_MSPMS/BUFFER_GETS_PMS ~ BUFFER_GETS_PMS, data = db9.c.cleaned ) # R-squared around 0.19 (below) means that only about 20% of # the variance in service time can be explained by # variation of workload, the remaining 80% of the # variance in service time is unexplained by this model # (Snedcor & Cochran p. 183) summary(db9.d.serviceTime.by.workload.lm) Call: lm(formula = DB_CPU_MSPMS/BUFFER_GETS_PMS ~ BUFFER_GETS_PMS, data = db9.c.cleaned) Residuals: Min 1Q Median 3Q Max -1.177e-03 -2.317e-04 -7.321e-05 1.948e-04 1.202e-03 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 8.149e-03 1.898e-04 42.932 < 2e-16 *** BUFFER_GETS_PMS 6.792e-06 1.108e-06 6.129 6.71e-09 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 0.0004109 on 159 degrees of freedom Multiple R-squared: 0.1911, Adjusted R-squared: 0.186 F-statistic: 37.56 on 1 and 159 DF, p-value: 6.712e-09 # so lest just look at summary stats # half of the serviceTime values are betwwen 0.00906 and 0.00949 # milliseconds per buffer get. # # So I'll use a constant serviceTime of 0.009295 milliseconds per buffer get. # > summary( db9.c.cleaned$DB_CPU_MSPMS/db9.c.cleaned$BUFFER_GETS_PMS) Min. 1st Qu. Median Mean 3rd Qu. Max. 0.008010 0.009060 0.009249 0.009295 0.009487 0.011010 > > mean( db9.c.cleaned$DB_CPU_MSPMS/db9.c.cleaned$BUFFER_GETS_PMS) [1] 0.00929524 > ####################################### png("db9-awr.plot-05.response-time.161.scatter.png", width=800, height=600, units="px", res=72 ) plot(DB_TIME_MSPMS/BUFFER_GETS_PMS ~ BUFFER_GETS_PMS, data = db9.c.cleaned, main="Response and Service time as function of workoad, 161 observations", sub="db9.c.cleaned", ylab="response and service time, milliseconds per buffer get", xlab="workload: buffer gets per millisecond", pch=20, xlim=c(0,300), ylim=c(0,0.04)) par(new=T) plot(DB_CPU_MSPMS/BUFFER_GETS_PMS ~ BUFFER_GETS_PMS, data = db9.c.cleaned, main="Response and Service time as function of workoad, 161 observations", sub="db9.c.cleaned", ylab="response and service time, milliseconds per buffer get", xlab="workload: buffer gets per millisecond", pch=3, xlim=c(0,300), ylim=c(0,0.04)) par(new=F) dev.off() ####################################### # remove response time outliers as if they were linear db9.e.responseTime.by.workload.lm <- lm(formula = DB_TIME_MSPMS/BUFFER_GETS_PMS ~ BUFFER_GETS_PMS, data = db9.c.cleaned ) png("db9-awr.plot-06.response-time.161.residuals.png", width=800, height=600, units="px", res=72 ) hist(residuals(db9.e.responseTime.by.workload.lm)) dev.off() db9.f.cleaned <- subset( cbind(db9.c.cleaned, residuals(db9.e.responseTime.by.workload.lm)), residuals(db9.e.responseTime.by.workload.lm) < 0.005 ) > dim(db9.f.cleaned) [1] 156 10 png("db9-awr.plot-07.response-time.156.scatter.png", width=800, height=600, units="px", res=72 ) plot(DB_TIME_MSPMS/BUFFER_GETS_PMS ~ BUFFER_GETS_PMS, data = db9.f.cleaned, main="Response and Service time as function of workoad, 156 observations", sub="db9.f.cleaned", ylab="response and service time, milliseconds per buffer get", xlab="workload: buffer gets per millisecond", pch=20, xlim=c(0,300), ylim=c(0,0.04)) par(new=T) plot(DB_CPU_MSPMS/BUFFER_GETS_PMS ~ BUFFER_GETS_PMS, data = db9.f.cleaned, main="Response and Service time as function of workoad, 156 observations", sub="db9.f.cleaned", ylab="response and service time, milliseconds per buffer get", xlab="workload: buffer gets per millisecond", pch=3, xlim=c(0,300), ylim=c(0,0.04)) par(new=F) dev.off() # from the UAF R Tutorial # http://mercury.bio.uaf.edu/mercury/R/R.html#nonlinear db9.f.cleaned.nls <- nls(DB_TIME_MSPMS/BUFFER_GETS_PMS~0.009295/(1-(((0.009295*BUFFER_GETS_PMS)/num.servers)^num.servers)), data = db9.f.cleaned, start=list(num.servers=2.5), trace=TRUE) summary(db9.f.cleaned.nls) coef(db9.f.cleaned.nls) x <- seq(0, 260, length=100) y <- predict(db9.f.cleaned.nls,data.frame(BUFFER_GETS_PMS=x)) png("db9-awr.plot-08.response-time.156.nls.png", width=800, height=600, units="px", res=72 ) plot(DB_TIME_MSPMS/BUFFER_GETS_PMS ~ BUFFER_GETS_PMS, data = db9.f.cleaned, main="Response time as function of workoad, nls() fit: 2.69 effective servers (Std. Error=0.01)", sub="db9.f.cleaned", ylab="response time, milliseconds per buffer get", xlab="workload: buffer gets per millisecond", pch=20, xlim=c(0,300), ylim=c(0,0.04)) par(new=T) lines(x,y) par(new=F) dev.off() png("db9-awr.plot-09.response-time.156.nls.residuals.png", width=800, height=600, units="px", res=72 ) hist(residuals(db9.f.cleaned.nls)) dev.off()