# Linear Regression
# May 24, 2012
# Genome 560
# generally good to type the variables (X, etc.) after making them, just to see what they contain
#
# A simple regression:
# use the "cats.txt" table, proportions of cells of one cell type in samples
# from cats (taken in our department many years ago).  Column 1 is the
# ID number of the particular cat.  You will want to plot the data from one
# cat.  For example cat 40004 is rows 1:17, 40005a is 18:31, 40005b is 32:47,
# 40006 is 48:65, 40665 is 66:83, and so on.
#
# read the table and make a data frame out of it
a <- read.table("http://www.cs.washington.edu/homes/suinlee/genome560/data/cats.txt")
X <- a[1:17,2]            # X is the ages (in days) of sampling for that cat
                          # (you could use some other cat)
Y <- a[1:17,3]            # Y is the fraction of cells of that cell type
plot(X, Y, ylim=c(0,max(Y)+0.2))   # plot Y versus X.  Could use "min" too
rr <- lsfit(X, Y)         # "rr" contains results of a Least Squares fit
abline(rr,col="red")      # add the least squares fit line to the plot, in red
plot(X,rr$residuals)      # a new plot of the residuals versus X.  Does it help?
                          # question: are the residuals independent and of
                          # equal variance?  Are successive residuals correlated?
ls.print(rr)              # look at estimates and tests for the fit
                          # is the slope significantly different from zero?
#
# Now for weighted regressions
#
#
#w <- a[1:17,4]           # number of cells sampled for that cat each day
#                         # points sampled from fewer cells should vary
#                         # more, so we want to weight the regression
#                         # proportional to the number of cells sampled
#
#mm <- lsfit(X,Y,wt=w)    # now do a weighted fit emphasizing small Y's
#
#abline(mm, col="blue")   # does it look better?
#
#plot(X,mm$residuals)     # do the residuals look of equal variance, independent?
#plot(X,mm$residuals*sqrt(w))  # scaling residuals so they might have equal variances.
#ls.print(mm)             # look at estimates, tests for mm

# do a multiple regression, using (instead of the cat data) a simulation
# where we know the truth
X1 <- rnorm(100,mean=10,sd=5)    # 100 random normal quantities, one variable
X2 <- rnorm(100,mean=-3,sd=4)    # another 100 with a different mean and variance
# Now we calculate observations -- a linear combination of these, plus more noise
Y <- 1.1 + 0.02*X1 + 0.3*X2 + rnorm(100,mean=0,sd=3)   # note it mostly depends on X2
# Maybe try plotting Y against X1, and against X2?
# Now we see whether we can find the truth:
X <- cbind(X1,X2)         # X1, X2, side by side, become columns of the "design matrix"
r2 <- lsfit(X,Y)          # Zap!  Pow!  least squares multiple regression!
ls.print(r2)              # what did it conclude?  Was there a nonzero slope in X1?
                          # in X2?

# How could you set up X's so as to do (say) a quadratic regression such as
#    Y = a X^2 + b X + c   using the multiple regression machinery?
# Hint: make up the right fake variables.  One is X1, one is X1^2.
# Try it.  Here we make up simulated data again:
Y <- 1.1 + 0.12*X1 - 0.3*X1^2 + rnorm(100,mean=0,sd=3)   # a quadratic with noise
# Can you plot Y versus X1?  Figure out how to do a multiple regression on X1 and X1^2?
# hint -- the "poly(X1,degree=2,raw=TRUE)" function builds those two columns for you
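
# Below is a minimal sketch of one way to set this up (it assumes the X1 and Y
# simulated just above are still in your workspace; the names Xq, rq, and rq2
# are just made up here):
plot(X1, Y)                        # the cloud should look curved, not straight
Xq <- cbind(X1, X1sq = X1^2)       # the "fake" variables: X1 and X1^2 as two columns
rq <- lsfit(Xq, Y)                 # multiple regression of Y on X1 and X1^2
ls.print(rq)                       # is the coefficient of the X1^2 column near -0.3?
# poly() can build the same two columns for you:
rq2 <- lsfit(poly(X1, degree=2, raw=TRUE), Y)
ls.print(rq2)                      # should give essentially the same estimates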
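
# (An optional follow-up to the residual questions for the simple cat fit near
# the top: a sketch of two quick checks, assuming "rr" from that lsfit is still
# around; "res" is just a temporary name.  Note acf() treats the points as
# equally spaced, which the sampling days may not be.)
res <- rr$residuals
cor(res[-length(res)], res[-1])    # correlation of each residual with the next one
acf(res)                           # autocorrelation at several lags; spikes outside
                                   # the dashed bands suggest correlated residuals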