/* RISE (Rule Induction from a Set of Exemplars) Pedro Domingos April 25, 1997 To use the RISE system: * Compile this file: cc -o rise rise.c -lm * Put your training data in a file named DF.data, and your test data in a file called DF.test. The format is similar to C4.5, without comments. One example per line, attributes separated by commas or spaces, unknowns represented by "?" or "*". * There's no names file. Instead, you give RISE a command line argument that is a string specifying the format of the data files. Each character in the string corresponds to an attribute. "s" means symbolic; "n" means numeric; "i" means ignore; and "c" means class. Thus you can ignore certain attributes, and the class can appear anywhere. For example, if you wanted to ignore the first attribute, the second and third were symbolic, the fourth and fifth were numeric, and the sixth was the class, your command to run rise would be: rise issnnc * The only thing this bare-bones version of RISE outputs is RISE's accuracy on the test set. Uncommenting the call to prls() in main() will produce a printout of the induced rules in tabular form (one rule per line, one attribute per column, ragged). The first four columns for each rule are: Laplace accuracy of the rule, number of training examples won by the rule, number of those examples correctly classified, and number of initial rules merged into this rule. The rule's antecedents start after the vertical bar ("|"). A "*" indicates the antecedent was dropped. A "?" indicates a missing value. Numeric antecedents are in the form of a range for the corresponding attribute, with the lower and upper limits separated by a colon (e.g., "4.1:5.9"). The predicted class is given after the arrow ("->"). Hint: If comprehensible output is a goal, ignore the large number of rules winning only a few examples, or none. Typically, only a few rules win more than a couple of examples, and they account for almost all of the accuracy. * If you need other stuff (e.g., the class assigned to each test example), let me know. * This basic version of RISE contains no speed-up method (e.g., windowing or partitioning). Thus, its running time is roughly quadratic in the number of examples and the number of attributes. This means it's OK for datasets up to 1000 examples / 50 or so attributes if you want an answer in minutes, and up to 10000 examples / 100 or so attributes if you want an answer overnight. Beyond that, you could try partitioning (i.e., learning m classifiers on m disjoint and exhaustive subsets of the dataset and combining them by voting), or waiting until I have added it to the code. * Needless to say, the current code is not robust with respect to errors in the data and command line. * There are some limits on #examples, #attributes, etc., which can be easily changed by finding them right below this. */ #include #include #include #include #define MaxAtts 101 /* max no. attributes + 1 (class) */ #define MaxClasses 30 /* max no. classes */ #define MaxExs 10000 /* max no. training exs */ #define MaxTest 5000 /* max no. test exs */ #define MaxVals 101 /* max no. vals for a symbolic attr + 1 (missing) */ #define MaxName 40 /* max length of a name */ #define Symbolic 0 #define Numeric 1 #define Any 2 #define Class 3 #define Letter 4 #define Range 5 #define Ignore 6 #define Missing attdefs[a].nvals #define False 0 #define True 1 #define Same 1 #define Different 0 #define None -1 #define Infinity ((float) 1E37) #define MaxRand 32767 #define Min(x,y) ((x) <= (y) ? (x) : (y)) #define Max(x,y) ((x) >= (y) ? (x) : (y)) #define Square(x) ((x) * (x)) #define LaplaceAcc(r) ((float)rules[r].ncorr + 1) / (rules[r].nwon + nclasses); struct attribute { int type, symb; /* symbolic values are represented by integers */ /* type: Symbolic, Numeric or Any (unknown value) */ float lower, upper; /* upper and lower bounds for numeric value */ } examples[MaxExs][MaxAtts]; struct { int nstrule, sign; /* sign: whether nstrule's class is same as ex's */ float dist; /* distance to nstrule */ } exdata[MaxExs], newexdata[MaxExs]; struct rule { struct attribute ants[MaxAtts]; int nmerged; /* no. instances merged into rule; nmerged = 0 : deleted */ int nwon, ncorr; /* no. exs won, no. exs correct */ float acc; /* Laplace accuracy */ } rules[MaxExs]; struct attdef { int type, nvals, nmiss; float min, max, avg; char name[MaxName], vals[MaxVals][MaxName]; } attdefs[MaxAtts]; int class, nclasses, nexs, nrules, natts, nfields; /* class = (natts)th attribute */ float distances[MaxAtts][MaxVals][MaxVals]; int classfreqs[MaxClasses]; unsigned long int next = 1; /* seed for random number generator */ main(argc, argv) int argc; char *argv[]; { input(argv[1], True); compvdm(); initrules(); ibl(); rise(); /* prls(); */ input(argv[1], False); test(); exit(0); } ibl() /* RISE initialization: instance-based learning */ { int e, r, nstrule; float lwstdist; for (r = 0; r < nexs; r++) rules[r].nwon = rules[r].ncorr = 0; for (e = 0; e < nexs; e++) { shine(examples[e], &nstrule, &lwstdist, e); exdata[e].nstrule = nstrule; exdata[e].sign = (examples[e][class].symb == rules[nstrule].ants[class].symb); exdata[e].dist = lwstdist; rules[nstrule].nwon++; if (exdata[e].sign == Same) rules[nstrule].ncorr++; } for (r = 0; r < nexs; r++) rules[r].acc = LaplaceAcc(r); } rise() { int e, r, improving, nstex, newon, newcorr, oldcorr; float distance(), dist, mindist, newdist; struct rule oldrule; do { improving = False; for (r = 0; r < nrules; r++) if (rules[r].nmerged > 0) { /* find nearest example of same class */ mindist = Infinity; nstex = None; for (e = 0; e < nexs; e++) if (rules[r].ants[class].symb == examples[e][class].symb) { dist = distance(&rules[r], examples[e], mindist); if (dist > 0 && dist < mindist) { nstex = e; mindist = dist; } } if (mindist >= 0.9 * Infinity) continue; /* generalize to cover nearest example of same class */ copyrule(&oldrule, &rules[r]); msg(&rules[r], examples[nstex]); /* accept if there's an improvement on newly won examples */ newon = newcorr = oldcorr = 0; for (e = 0; e < nexs; e++) if (exdata[e].nstrule != r && (r != e || rules[r].nmerged > 1) && ((newdist = distance(&rules[r], examples[e], exdata[e].dist)) <= exdata[e].dist)) { newexdata[e].nstrule = r; newexdata[e].sign = (rules[r].ants[class].symb == examples[e][class].symb); newexdata[e].dist = newdist; newon++; if (newexdata[e].sign == Same) newcorr++; if (exdata[e].sign == Same) oldcorr++; } else newexdata[e].nstrule = None; if (newcorr > oldcorr /* if = : Occam's razor */ || newcorr == oldcorr && !eqrule(&oldrule, &rules[r])) { improving = True; rules[r].nwon += newon; rules[r].ncorr += newcorr; rules[r].acc = LaplaceAcc(r); for (e = 0; e < nexs; e++) if (newexdata[e].nstrule == r) { rules[exdata[e].nstrule].nwon--; if (exdata[e].sign == Same) rules[exdata[e].nstrule].ncorr--; rules[exdata[e].nstrule].acc = LaplaceAcc(exdata[e].nstrule); exdata[e].nstrule = r; exdata[e].sign = newexdata[e].sign; exdata[e].dist = newexdata[e].dist; } delduplics(r); } else copyrule(&rules[r], &oldrule); } } while (improving); } float distance(rl, ex, mindist) /* distance between rule and example */ struct rule *rl; struct attribute ex[]; float mindist; { int a; float dist = 0, di; struct attribute exatt, rulant; for (a = 0; a < natts; a++) if (dist > mindist) break; else { exatt = ex[a]; rulant = rl->ants[a]; if (rulant.type == Any || exatt.type == Symbolic && exatt.symb == rulant.symb || exatt.type == Numeric && exatt.lower <= rulant.upper && exatt.lower >= rulant.lower || exatt.type == Any) continue; else if (exatt.type == Symbolic) dist += Square(distances[a][rulant.symb][exatt.symb]); else if (exatt.type == Numeric) { di = Min(fabs(rulant.lower - exatt.lower), fabs(rulant.upper - exatt.lower)) / (attdefs[a].max - attdefs[a].min); dist += Square(di); } } return(dist); } msg(rl, ex) /* generalize rule minimally to include example */ struct rule *rl; struct attribute ex[]; { int a; for (a = 0; a < natts; a++) { if (rl->ants[a].type == Any || ex[a].type == Any || ex[a].type == Symbolic && ex[a].symb == rl->ants[a].symb || ex[a].type == Numeric && ex[a].lower <= rl->ants[a].upper && ex[a].lower >= rl->ants[a].lower) continue; else if (ex[a].type == Symbolic) rl->ants[a].type = Any; else if (ex[a].type == Numeric) { if (ex[a].lower > rl->ants[a].upper) rl->ants[a].upper = ex[a].upper + (nexs > 3000 ? 0.05 * (attdefs[a].max - attdefs[a].min) : 0); else /* ie. if (ex[a].lower < rl->ants[a].lower) */ rl->ants[a].lower = ex[a].lower - (nexs > 3000 ? 0.05 * (attdefs[a].max - attdefs[a].min) : 0); } } } copyrule(nr, or) struct rule *nr, *or; { int a; for (a = 0; a <= natts; a++) copyatt(&nr->ants[a], &or->ants[a]); nr->nmerged = or->nmerged; nr->nwon = or->nwon; nr->ncorr = or->ncorr; nr->acc = or->acc; } copyatt(na, oa) struct attribute *na, *oa; { na->type = oa->type; na->symb = oa->symb; na->lower = oa->lower; na->upper = oa->upper; } eqrule(r1, r2) struct rule *r1, *r2; { int a; for (a = 0; a <= natts; a++) if (!eqattr(&r1->ants[a], &r2->ants[a])) return(False); return(True); } eqattr(a1, a2) struct attribute *a1, *a2; { if (a1->type != a2->type || a1->type == Symbolic && a1->symb != a2->symb || a1->type == Numeric && (a1->lower != a2->lower || a1->upper != a2->upper)) return(False); else return(True); } delduplics(r) /* delete duplicate rules */ int r; { int q, a, e, same; for (q = 0; q < nrules; q++) { if (rules[q].nmerged == 0) continue; same = True; for (a = 0; a <= natts; a++) /* a=natts: class should also be the same */ if (!( rules[r].ants[a].type == Any && rules[q].ants[a].type == Any || rules[r].ants[a].type == Symbolic && rules[q].ants[a].type == Symbolic && rules[r].ants[a].symb == rules[q].ants[a].symb || rules[r].ants[a].type == Numeric && rules[q].ants[a].type == Numeric && rules[r].ants[a].lower == rules[q].ants[a].lower && rules[r].ants[a].upper == rules[q].ants[a].upper )) { same = False; break; } if (same && r != q) { rules[r].nmerged += rules[q].nmerged; rules[q].nmerged = 0; rules[r].nwon += rules[q].nwon; rules[r].ncorr += rules[q].ncorr; rules[r].acc = LaplaceAcc(r); for (e = 0; e < nexs; e++) if (exdata[e].nstrule == q) exdata[e].nstrule = r; } } } shine(ex, nstrule, lwstdist, leftout) /* classify example */ struct attribute ex[]; int *nstrule, leftout; float *lwstdist; { int r, fr, fn; float dist, distance(), ac, an; *nstrule = None; *lwstdist = Infinity; for (r = 0; r < nrules; r++) if (r != leftout && rules[r].nmerged > 0) { dist = distance(&rules[r], ex, *lwstdist); if (dist < *lwstdist || dist == *lwstdist && (ac = rules[r].acc) > (an = rules[*nstrule].acc) || dist == *lwstdist && ac == an && (fr = classfreqs[rules[r].ants[class].symb]) > (fn = classfreqs[rules[*nstrule].ants[class].symb]) || dist == *lwstdist && ac == an && fr == fn && ((float) randnum() / MaxRand > 0.5)) { *nstrule = r; *lwstdist = dist; } } return(rules[*nstrule].ants[class].symb); } initrules() { int a, c, e; nrules = nexs; /* compute class frequencies */ for (c = 0; c < nclasses; c++) classfreqs[c] = 0; for (e = 0; e < nexs; e++) classfreqs[examples[e][class].symb]++; /* initialize rules */ for (e = 0; e < nexs; e++) { for (a = 0; a <= natts; a++) copyatt(&rules[e].ants[a], &examples[e][a]); rules[e].nmerged = 1; rules[e].nwon = rules[e].ncorr = 0; rules[e].acc = (float) classfreqs[rules[e].ants[class].symb] / nexs; } } compvdm() /* compute value difference metric for symbolic attributes */ { int a, c, e, v, w, nvals, valfreqs[MaxVals]; float classvalfreqs[MaxClasses][MaxVals]; for (a = 0; a < natts; a++) if (attdefs[a].type == Symbolic) { nvals = attdefs[a].nvals; for (v = 0; v <= nvals; v++) { valfreqs[v] = 0; for (c = 0; c < nclasses; c++) classvalfreqs[c][v] = 0; } for (e = 0; e < nexs; e++) { valfreqs[examples[e][a].symb]++; classvalfreqs[examples[e][class].symb][examples[e][a].symb] += 1; } for (v = 0; v <= nvals; v++) for (c = 0; c < nclasses; c++) if (valfreqs[v] != 0) classvalfreqs[c][v] = classvalfreqs[c][v] / valfreqs[v]; for (v = 0; v <= nvals; v++) for (w = 0; w <= nvals; w++) if (valfreqs[v] == 0 || valfreqs[w] == 0) distances[a][v][w] = 1; else { distances[a][v][w] = 0; for (c = 0; c < nclasses; c++) distances[a][v][w] += fabs(classvalfreqs[c][v] - classvalfreqs[c][w]); } } } input(def, train) char def[]; int train; { int a, b, c, e, v, ch; char val[MaxName]; FILE *data; if (train) { parsedef(def); for (a = 0; a < nfields; a++) initattdef(a); data = fopen("DF.data", "r"); } else { struct attdef cattdefs[MaxAtts]; /* current, condensed attdefs */ for (a = 0; a <= natts; a++) copyattdef(&cattdefs[a], &attdefs[a]); parsedef(def); restore(cattdefs); data = fopen("DF.test", "r"); } /* read examples from data file */ e = 0; while ((ch = getc(data)) != EOF && e < MaxExs) { ungetc(ch, data); a = b = 0; while (getval(data, val, (attdefs[a].type == Letter), (attdefs[a].type == Range))) { if ((strcmp(val, "?") == 0 || strcmp(val, "*") == 0) && attdefs[a].type != Ignore) { examples[e][b].type = Any; attdefs[a].nmiss++; } else switch(attdefs[a].type) { case Symbolic: case Letter: case Class: examples[e][b].type = Symbolic; for (v = 0; v < attdefs[a].nvals; v++) if (strcmp(val, attdefs[a].vals[v]) == 0) { examples[e][b].symb = v; break; } if (v >= attdefs[a].nvals) { strcpy(attdefs[a].vals[v], val); examples[e][b].symb = v; attdefs[a].nvals++; } if (attdefs[a].type == Class && b != natts) { b--; examples[e][class].type = Symbolic; examples[e][class].symb = v; } break; case Numeric: case Range: examples[e][b].type = Numeric; sscanf(val, "%f", &examples[e][b].lower); if (attdefs[a].type == Numeric) examples[e][b].upper = examples[e][b].lower; else { c = 0; while (val[c] != ' ') c++; sscanf(&val[c], "%f", &examples[e][b].upper); examples[e][b].upper = examples[e][b].lower = (examples[e][b].upper + examples[e][b].lower) / 2; } if (train) { if (examples[e][b].lower > attdefs[a].max) attdefs[a].max = examples[e][b].lower; if (examples[e][b].lower < attdefs[a].min) attdefs[a].min = examples[e][b].lower; attdefs[a].avg += examples[e][b].lower; } break; case Ignore: b--; break; default: break; } a++; b++; if (b > natts + 1) break; } if (a == nfields) e++; else if (a != 0) error("Wrong length example", 0); } nexs = e; fclose(data); cleanup(train); } parsedef(def) /* parse example file definition string */ char def[]; { int a, c, nignored = 0; a = 0; c = 0; while (def[c] != '\0') { switch(def[c]) { case 's': attdefs[a].type = Symbolic; break; case 'l': attdefs[a].type = Letter; break; case 'c': attdefs[a].type = Class; break; case 'n': attdefs[a].type = Numeric; break; case 'r': attdefs[a].type = Range; break; case 'i': attdefs[a].type = Ignore; nignored++; break; case ' ': case '\t': case ',': case ':': case ';': a--; break; default: error("Unknown attribute type", 0); a--; break; } a++; c++; } nfields = a; natts = class = nfields - nignored - 1; } getval(data, val, isletter, isrange) /* reads an attribute value from the data file */ FILE *data; char val[]; int isletter, isrange; { char sep; int c, ch, cut; /* swallow separator */ do { sep = getc(data); if (sep == '\n') return(False); if (sep == EOF) error("Data file must end with a newline", 1); } while (sep == ',' || sep == ' ' || sep == '\t'); /* read value */ c = 0; val[0] = sep; do { c++; ch = getc(data); val[c] = (char) ch; if (isrange && (val[c] == '-' || val[c] == ':')) cut = c; } while (val[c] != ' ' && val[c] != ',' && val[c] != '\t' && val[c] != '\n' && val[c] != EOF && !isletter); /* housekeeping */ if (val[c] == EOF) error("Data file must end with a newline", 1); ungetc(ch, data); val[c] = '\0'; if (isrange) val[cut] = ' '; return(True); } cleanup(train) /* del ignoreds, move class to end in "attdefs" array, change types to symbolic or numeric only, convert missing, store avg */ int train; { int a, b, e; struct attdef classdef; for (a = b = 0; a < nfields; a++, b++) switch(attdefs[a].type) { case Symbolic: copyattdef(&attdefs[b], &attdefs[a]); break; case Letter: copyattdef(&attdefs[b], &attdefs[a]); attdefs[b].type = Symbolic; break; case Class: copyattdef(&classdef, &attdefs[a]); classdef.type = Symbolic; b--; break; case Numeric: copyattdef(&attdefs[b], &attdefs[a]); if (train) attdefs[b].avg /= nexs; break; case Range: copyattdef(&attdefs[b], &attdefs[a]); if (train) attdefs[b].avg /= nexs; attdefs[b].type = Numeric; break; case Ignore: b--; break; default: break; } copyattdef(&attdefs[class], &classdef); nclasses = attdefs[class].nvals; /* convert missing symbolic values to vdm-comparable values */ for (a = 0; a < natts; a++) if (attdefs[a].type == Symbolic) for (e = 0; e < nexs; e++) if (examples[e][a].type == Any) { examples[e][a].type = Symbolic; examples[e][a].symb = Missing; } } initattdef(a) int a; { int v; /* attdefs[a].type set by parsedef() */ attdefs[a].max = -Infinity; attdefs[a].min = Infinity; attdefs[a].avg = 0; attdefs[a].nvals = attdefs[a].nmiss = 0; attdefs[a].name[0] = '\0'; for (v = 0; v < MaxVals; v++) attdefs[a].vals[v][0] = '\0'; } tcopyattdef(nd, od) /* copy attdef except type */ struct attdef *nd, *od; { int v; nd->min = od->min; nd->max = od->max; nd->avg = od->avg; nd->nmiss = od->nmiss; strcpy(nd->name, od->name); for (v = 0; v < od->nvals; v++) strcpy(nd->vals[v], od->vals[v]); for (v = od->nvals; v < nd->nvals; v++) nd->vals[v][0] = '\0'; /* if ovals < nvals, blanks out rest */ nd->nvals = od->nvals; } copyattdef(nd, od) /* copy complete attdef */ struct attdef *nd, *od; { int v; nd->type = od->type; tcopyattdef(nd, od); } restore(cattdefs) /* restore raw attdefs, with ignoreds and class in orig position */ struct attdef cattdefs[]; /* current, condensed attdefs */ { int a, b; for (a = b = 0; a < nfields; a++, b++) switch(attdefs[a].type) { case Symbolic: case Letter: case Numeric: case Range: tcopyattdef(&attdefs[a], &cattdefs[b]); break; case Class: tcopyattdef(&attdefs[a], &cattdefs[class]); b--; break; case Ignore: initattdef(a); b--; break; default: break; } } test() /* apply RISE to a set of test examples and measure accuracy */ { int e, nstrule, hits = 0; float lwstdist; for (e = 0; e < nexs; e++) if (shine(examples[e], &nstrule, &lwstdist, None) == examples[e][class].symb) hits++; printf("RISE's accuracy: %5.1f %%\n", 100 * (float) hits / nexs); } error(msg, code) char *msg; int code; { if (code > 0) { printf("** RISE: Error: %s. **\n", msg); exit(code); } else printf("** RISE: Warning: %s. **\n", msg); } randnum() { next = next * 1103515245 + 12345; return((unsigned int) (next / 65536) % 32768); } prx(ex) /* print one example, given by pointer */ struct attribute ex[]; { int a; for (a = 0; a < natts; a++) { if (ex[a].type == Any || ex[a].type == Symbolic && ex[a].symb == Missing) printf("* "); else if (ex[a].type == Symbolic) /* printf("%d ", ex[a].symb); */ printf("%s ", attdefs[a].vals[ex[a].symb]); else if (ex[a].type == Numeric) printf("%g ", ex[a].lower); } /* printf("-> %d", ex[class].symb); */ printf("-> %s", attdefs[class].vals[ex[class].symb]); printf("\n"); } pre(e) /* print one example, given by index */ int e; { prx(examples[e]); } pres() /* print all examples */ { int e; for (e = 0; e < nexs; e++) pre(e); printf("\n"); } prl(r) /* print one rule */ int r; { int a; printf("%5.3f %3d %3d %3d | ", rules[r].acc, rules[r].nwon, rules[r].ncorr, rules[r].nmerged); for (a = 0; a < natts; a++) { if (rules[r].ants[a].type == Any) printf("* "); else if (rules[r].ants[a].type == Symbolic && rules[r].ants[a].symb == Missing) printf("? "); else if (rules[r].ants[a].type == Symbolic) /* printf("%d ", rules[r].ants[a].symb); */ printf("%s ", attdefs[a].vals[rules[r].ants[a].symb]); else if (rules[r].ants[a].type == Numeric) { if (rules[r].ants[a].lower < -0.9 * Infinity && rules[r].ants[a].upper > 0.9 * Infinity) printf("* "); else if (rules[r].ants[a].lower < -0.9 * Infinity) printf ("<%g ", rules[r].ants[a].upper); else if (rules[r].ants[a].upper > 0.9 * Infinity) printf (">%g ", rules[r].ants[a].lower); else printf("%g:%g ", rules[r].ants[a].lower, rules[r].ants[a].upper); } } /* printf("-> %d", rules[r].ants[class].symb); */ printf("-> %s", attdefs[class].vals[rules[r].ants[class].symb]); printf("\n"); } prls() /* print all active rules */ { int r; for (r = 0; r < nrules; r++) if (rules[r].nmerged > 0) prl(r); printf("\n"); }