/*
 * Simple EM demo for fitting a mixture of two classes to data in two
 * Boolean variables.
 *
 * Use: emdemo seed #iterations #00 #01 #10 #11
 * where
 *   seed        = integer used to initialize random number generator
 *   #iterations = how many EM iterations to run
 *   #00         = how many data elements are <0, 0>
 *   #01         = how many data elements are <0, 1>
 *   #10         = how many data elements are <1, 0>
 *   #11         = how many data elements are <1, 1>
 *
 * Assumes a naive Bayes model:
 *
 *        Class
 *        /   \
 *     var0   var1
 *
 * Uses EM to estimate parameters of Bayesian model:
 *   - Probability of choice of class c0 or c1            P(C)
 *   - Probability tables for var0 and var1 conditioned
 *     on choice of class                                 P(V0 | C)  P(V1 | C)
 *
 * After each iteration prints the estimated probability of class given a
 * data element P(C | V0 V1), the new parameters of the model, and the log
 * likelihood of the data.
 */

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

int data[4];            /* data[combination] -- number of training instances of combination */
float data_class[4][2]; /* data_class[combination][class] -- combination is 00, 01, 10, 11 */
float prior[2];         /* prior[class] - prob instance is class */
float cpt[2][2][2];     /* cpt[variable][parent class][false / true] */
int seed;               /* random number generator seed (from argv[1]) */
int numit;              /* number of EM iterations to run (from argv[2]) */
float logmarginal[4];   /* logmarginal[combo] -- log P(combo) under current parameters */
float loglike;          /* log likelihood of all the data under current parameters */

/* Rescale the n entries of d[] in place so they sum to 1.
   Accumulates in double to limit rounding error. */
void normalize(int n, float d[])
{
  double f = 0.0;
  int i;
  for (i = 0; i < n; i++) f += d[i];
  for (i = 0; i < n; i++) d[i] = d[i] / f;
}

/* Run numit iterations of EM on the counts in data[], printing the
   E-step posteriors, the M-step parameters, and the log likelihood
   after each iteration.  Reads/writes the file-scope model globals. */
void emdemo(void)
{
  int i, c, v, combo;

  /* Start EM from a random (strictly positive, normalized) parameter
     setting.  NOTE(review): reconstructed -- the exact original
     initialization was lost; any random normalized start is valid. */
  for (c = 0; c < 2; c++)
    prior[c] = (float)(rand() + 1.0);
  normalize(2, prior);
  for (v = 0; v < 2; v++) {
    for (c = 0; c < 2; c++) {
      cpt[v][c][0] = (float)(rand() + 1.0);
      cpt[v][c][1] = (float)(rand() + 1.0);
      normalize(2, cpt[v][c]);
    }
  }

  for (i = 1; i <= numit; i++) {
    /* E-step: P(class | combo) proportional to
       P(class) * P(v0 | class) * P(v1 | class).
       Bit 0 of combo is var0's value, bit 1 is var1's value
       (matches the logmarginal[] formulas below). */
    for (combo = 0; combo < 4; combo++) {
      for (c = 0; c < 2; c++) {
        data_class[combo][c] =
          prior[c] * cpt[0][c][combo & 1] * cpt[1][c][combo >> 1];
      }
      normalize(2, data_class[combo]);
    }
    printf("Iteration %d\n", i);
    printf(" E-step\n");
    printf(" P(c1|00)= %8.6f P(c2|00)= %8.6f\n", data_class[0][0], data_class[0][1]);
    printf(" P(c1|01)= %8.6f P(c2|01)= %8.6f\n", data_class[1][0], data_class[1][1]);
    printf(" P(c1|10)= %8.6f P(c2|10)= %8.6f\n", data_class[2][0], data_class[2][1]);
    printf(" P(c1|11)= %8.6f P(c2|11)= %8.6f\n", data_class[3][0], data_class[3][1]);

    /* M-step: recalculate prior and cpt from the expected counts. */
    for (c = 0; c < 2; c++) {
      prior[c] = 0.0;
      for (combo = 0; combo < 4; combo++)
        prior[c] += data[combo] * data_class[combo][c];
    }
    normalize(2, prior);
    for (c = 0; c < 2; c++) {
      /* var0: combos 0,2 have var0 false; combos 1,3 have var0 true */
      cpt[0][c][0] = data[0] * data_class[0][c] + data[2] * data_class[2][c];
      cpt[0][c][1] = data[1] * data_class[1][c] + data[3] * data_class[3][c];
      normalize(2, cpt[0][c]);
      /* var1: combos 0,1 have var1 false; combos 2,3 have var1 true */
      cpt[1][c][0] = data[0] * data_class[0][c] + data[1] * data_class[1][c];
      cpt[1][c][1] = data[2] * data_class[2][c] + data[3] * data_class[3][c];
      normalize(2, cpt[1][c]);
    }
    printf(" M-step\n");
    printf(" P(c0)= %8.6f P(c1)= %8.6f\n", prior[0], prior[1]);
    printf(" P(~v0|c0)= %8.6f P(v0|c0)= %8.6f\n", cpt[0][0][0], cpt[0][0][1]);
    printf(" P(~v0|c1)= %8.6f P(v0|c1)= %8.6f\n", cpt[0][1][0], cpt[0][1][1]);
    printf(" P(~v1|c0)= %8.6f P(v1|c0)= %8.6f\n", cpt[1][0][0], cpt[1][0][1]);
    printf(" P(~v1|c1)= %8.6f P(v1|c1)= %8.6f\n", cpt[1][1][0], cpt[1][1][1]);

    /* Compute the log likelihood of the data:
       log P(combo) = log( sum over classes of P(class) P(v0|class) P(v1|class) ) */
    logmarginal[0] = log((cpt[0][0][0] * cpt[1][0][0] * prior[0]) +
                         (cpt[0][1][0] * cpt[1][1][0] * prior[1]));
    logmarginal[1] = log((cpt[0][0][1] * cpt[1][0][0] * prior[0]) +
                         (cpt[0][1][1] * cpt[1][1][0] * prior[1]));
    logmarginal[2] = log((cpt[0][0][0] * cpt[1][0][1] * prior[0]) +
                         (cpt[0][1][0] * cpt[1][1][1] * prior[1]));
    logmarginal[3] = log((cpt[0][0][1] * cpt[1][0][1] * prior[0]) +
                         (cpt[0][1][1] * cpt[1][1][1] * prior[1]));
    loglike = 0.0;
    for (combo = 0; combo < 4; combo++)
      loglike += data[combo] * logmarginal[combo];
    /* A zero-count combo can make log(0) = -inf and 0 * -inf = NaN;
       report 0.0 in that degenerate case rather than NaN. */
    if (isnan(loglike)) loglike = 0.0;
    printf(" Log likelihood = %8.1f\n", loglike);
    printf("\n");
  }
}

/* Parse the six command-line arguments, seed the RNG, and run the demo.
   Exits with status 1 on incorrect usage. */
int main(int argc, char **argv)
{
  if (argc != 7) {
    fprintf(stderr, "Use: emdemo seed #iterations #00 #01 #10 #11\n");
    exit(1);
  }
  seed = atoi(argv[1]);
  srand(seed);
  numit = atoi(argv[2]);
  data[0] = atoi(argv[3]);
  data[1] = atoi(argv[4]);
  data[2] = atoi(argv[5]);
  data[3] = atoi(argv[6]);
  emdemo();
  return 0;
}