# Linear Regression
# May 24, 2012
# Genome 560
# generally good to type the variables (X, etc.) after making them, just to see what they contain
#
# A simple regression:
# use the "cats.txt" table, proportions of cells of one cell type in samples
# from cats (taken in our department many years ago).  Column 1 is the
# ID number of the particular cat.  You will want to plot the data from one
# cat.  For example cat 40004 is rows 1:17, 40005a is 18:31, 40005b is 32:47,
# 40006 is 48:65, 40665 is 66:83, and so on.
#
# read the table and make a data frame out of it
a <- read.table("http://www.cs.washington.edu/homes/suinlee/genome560/data/cats.txt")
X <- a[1:17,2]            # X is the ages (in days) of sampling for that cat
                          # (you could use some other cat)
Y <- a[1:17,3]            # Y is the fraction of cells of that cell type
plot(X, Y, ylim=c(0,max(Y)+0.2))   # plot Y versus X.  Could use "min" too
rr <- lsfit(X, Y)         # "rr" contains results of a Least Squares fit
abline(rr,col="red")      # add the least squares fit line to the plot, in red
plot(X,rr$residuals)      # a new plot of the residuals versus X.  Does it help?
                          # question: are the residuals independent and of
                          # equal variance?  Are successive residuals correlated?
ls.print(rr)              # look at estimates and tests for the fit
                          # is the slope significantly different from zero?
#
# Now for weighted regressions
#
#
#w <- a[1:17,4]           # number of cells sampled for that cat each day
#                         # points sampled from fewer cells should vary
#                         # more, so we want to weight the regression
#                         # proportional to the number of cells sampled
#
#mm <- lsfit(X,Y,wt=w)    # now do a weighted fit emphasizing small Y's
#
#abline(mm, col="blue")   # does it look better?
#
#plot(X,mm$residuals)     # do the residuals look of equal variance, independent?
#plot(X,mm$residuals*sqrt(w))  # scaling residuals so they might have equal variances.
#ls.print(mm)             # look at estimates, tests for mm

# do a multiple regression, using (instead of the cat data) a simulation
# where we know the truth
X1 <- rnorm(100,mean=10,sd=5)    # 100 random normal quantities, one variable
X2 <- rnorm(100,mean=-3,sd=4)    # another 100 with a different mean and variance
# Now we calculate observations -- a linear combination of these, plus more noise
Y <- 1.1 + 0.02*X1 + 0.3*X2 + rnorm(100,mean=0,sd=3)   # note it mostly depends on X2
# Maybe try plotting Y against X1, and against X2?
# Now we see whether we can find the truth:
X <- cbind(X1,X2)         # X1, X2, side by side, become columns of the "design matrix"
r2 <- lsfit(X,Y)          # Zap!  Pow!  least squares multiple regression!
ls.print(r2)              # what did it conclude?  Was there a nonzero slope in X1?
                          # in X2?

# How could you set up X's so as to do (say) a quadratic regression such as
#    Y = a X^2 + b X + c   using the multiple regression machinery?
# Hint: make up the right fake variables.  One is X1, one is X1^2.
# Try it.  Here we make up simulated data again:
Y <- 1.1 + 0.12*X1 - 0.3*X1^2 + rnorm(100,mean=0,sd=3)   # a quadratic with noise
# Can you plot Y versus X1?  Figure out how to do a multiple regression on X1 and X1^2?
# hint -- the "poly(X1,degree=2,raw=TRUE)" function builds those two columns for you
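
# Below is a minimal sketch of one way to set this up (it assumes the X1 and Y
# simulated just above are still in your workspace; the names Xq, rq, and rq2
# are just made up here):
plot(X1, Y)                        # the cloud should look curved, not straight
Xq <- cbind(X1, X1sq = X1^2)       # the "fake" variables: X1 and X1^2 as two columns
rq <- lsfit(Xq, Y)                 # multiple regression of Y on X1 and X1^2
ls.print(rq)                       # is the coefficient of the X1^2 column near -0.3?
# poly() can build the same two columns for you:
rq2 <- lsfit(poly(X1, degree=2, raw=TRUE), Y)
ls.print(rq2)                      # should give essentially the same estimates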
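
# (An optional follow-up to the residual questions for the simple cat fit near
# the top: a sketch of two quick checks, assuming "rr" from that lsfit is still
# around; "res" is just a temporary name.  Note acf() treats the points as
# equally spaced, which the sampling days may not be.)
res <- rr$residuals
cor(res[-length(res)], res[-1])    # correlation of each residual with the next one
acf(res)                           # autocorrelation at several lags; spikes outside
                                   # the dashed bands suggest correlated residuals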