/* microsat.c version 1.5e Eric Minch Dept. of Genetics Stanford University Medical Center Stanford, CA 94305 minch@lotka.stanford.edu Description of program: Todo list: !*Add chosen distance to diversities(?) !*Bootstrap diversities *Estimate of Nm (Slatkin eq. 15a) *?Correlation matrix for diversities ?Allele resolution ?Allow nominal data ?Histogram of alleles by locus by taxon ?Handle haplotypes (e.g., Y-chromosome data) ?Hierarchical levels ?Compound genotypes (e.g., duplicate loci) @Goodness-of-fit for trees of individuals Key for todo list: !!Last week ! Immediately ? Defer indefinitely * Move data description to statistics options @ Enable tree comparison analysis History: August 1998, v1.5e fixed anomaly in kinship distance with 1-kf transformation; formerly generated MAXDIST for no shared alleles, now generates 1 April 1997, v1.5d removed adjustment from average square, allowed adjusted delta mu square April 1997, v1.5c corrected flaw in adjustment to average square and kinship distances February 1997 finally fixed check for bootstraps in runs with small number of loci January 1997, v1.5b prevented overflow in calculating number of possible bootstraps November 1996, v1.5a corrected substitution of median for maximum in frequency table averages removed special characters (^2 and mu) from help text truncated locus and taxon names on output October 1996, v1.5: Goodman's standardized Rst Help option September 1996: frequencies and mean repeat size by locus or taxon allow adjustment of D1 and Dks: subtract pairwise average within-values diversities by locus or taxon distance matrix by locus pairwise taxon-specific alleles; number by locus and average over loci mean, median, mode, minimum, maximum, and frequency distribution by locus added entropy to diversities Dps, Dks, Dfs with -ln(proportion) or 1-proportion check for zero==missing data added maximum repeat size to diversities reorganized menu structure to separate data transformation options and I/O options from distance and statistics options August 1996: added average and total maximum allele size to diversity indices July 1996: added check for exhaustive bootstrapping March 1996: added Fst, D1, Dsw distances February 1996: added frequency counts added mean allele size to diversity indices fixed field-checking routine January 1996, v1.4g: added Fst Het, Het Tot, Var Tot, # alleles, total # alleles, allele range, total allele range, and their averages to the diversity indices December 1995, v1.4f: added absolute difference distance Dad September 1995, v1.4e: primer error defaults to 0 if alleles are repeats (or repeat length=1) September 1995, v1.4d: fixed pesky negative variances (integer overflow problem) June 1995, v1.4c: included checking for monomorphisms and trapping div by zero in Fst June 1995, v1.4b: included checking for correct number of fields May 1995, v1.4a: included trap for negative allele lengths May 1995, v1.4: replaced MAXINT with locally-defined BIGINT replaced malloc() with error-trapping myalloc() replaced buggy < with <= in counting loop for PS-, KF-, and NI-distances Suggested reference: Minch, E., Ruiz-Linares, A., Goldstein, D., Feldman, M., and Cavalli-Sforza, L.L. (1995, 1996) Microsat (version 1.5b): a computer program for calculating various statistics on microsatellite allele data. WWW: http://lotka.stanford.edu/microsat.html. Copyright statements: ------------------------------------------------------------------------------ Bag and Lexicon source code is (c) 1989 Eric Minch. All rights reserved. Microsat source code is (c) 1995 The Board of Trustees of the Leland Stanford Junior University. All rights reserved. ------------------------------------------------------------------------------ */ #include #include #include #include #include /* Utility macros */ #define MAX(a,b) (((a)>(b))?(a):(b)) #define MIN(a,b) (((a)<(b))?(a):(b)) #define PHI(r) ((r*r-1)/6) #define TAU(r) MAX(0, PHI(r)/(2*mutationRate)) /* Utility constants */ #define COMPLEMENT (~0) /* Statistics option flag constants */ #define MISSING 1 #define ANOMALY 2 #define COMPARE 4 #define SPECIFIC 8 #define OUTLIER 16 #define HINGE 32 #define LINEARITY 64 #define FREQUENCY 128 #define DIVERSITY 256 #define BYTAXON 512 #define PAIRWISE 1024 #define CROSSTAB 2048 /* Distance option flag constants */ #define NO 0 #define DM 1 #define PS 2 #define KF 3 #define FS 4 #define NI 5 #define AD 6 #define SW 7 #define RF 8 #define AS 9 #define SR 10 #define BYLOCUS 1 #define LOGARITHM 2 #define ADJUST 4 /* Help option constants */ #define MAIN 0 #define STATISTICS 1 #define DISTANCE 2 /* Bootstrap option flag constants */ #define ALL 1 #define MEAN 2 /* Default constants */ #define DEFAULTMUTATION 0.00056 #define DEFAULTPRIMER 0 #define DEFAULTREPEAT 2 #define DEFAULTSIGMA 3 #define DEFAULTHINGE 4 /* Other constants */ #define BUFFSIZE 1024 #define BIGINT 32767 #define MISSINGDATA -BIGINT #define MAXDIST 10.0 #define MAXLOCI 500 #define ERROR 0 #define ALPHA 0.05 /* Memory management */ void *myalloc(int n) { static long int total=0; void *memory=NULL; memory=malloc(n); total+=n; if (memory) return memory; printf("Error allocating %d bytes of memory; ", n); printf("total allocated so far was %ld\n", total-n); exit(1); } /* Field checking */ int fieldCheck(char *string) { /* Check the number of fields. No checking is done as to type or value. */ int fields; char *token; char buffer[BUFFSIZE]; strcpy(buffer, string); token=strtok(buffer, " \t\n"); fields=0; while (token) { fields++; token=strtok(NULL, " \t\n"); } if (fields==3||fields==4) return 1; if (fields) printf("Input error: %s", string); return 0; } /* Random numbers */ float uran(void) { /* This is provided only as a default uniform [0,1) pseudorandom number generator. The rand() function in particular is not ideal, and should be replaced by a call to your favorite pseudorandom number generator. */ return 1.0*rand()/RAND_MAX; } int iran(int n) { /* This is intended to return an integer in the range 0..n-1. */ int m; while ((m=floor(n*uran()))==n); return m; } double binomial(int n, int m) { /* Return n choose m */ int i, k; double answer; if (m>(n/2)) k=n-m; else k=m; if (k==0) return 1.0; if (k<0) return 0.0; if (k==1) return n; answer=1.0; for (i=1; i<=k; i++) answer*=1.0*(n-k+i)/i; return answer; } /* Bags */ struct Bag { int itemCount; int *data; }; struct Bag *initBag() { struct Bag *bag; bag=myalloc(sizeof(struct Bag)); bag->itemCount=1; bag->data=myalloc(sizeof(int)); bag->data[0]=0; return bag; } void freeBag(struct Bag *bag) { free(bag->data); free(bag); } void emptyBag(struct Bag *bag) { int i; for (i=0; iitemCount; i++) bag->data[i]=0; } void bump(struct Bag *bag, int index, int increment) { int i, count; int *newdata; count=bag->itemCount; if (index>=count) { newdata=myalloc((index+1)*sizeof(int)); for (i=0; idata[i]; for (i=count; i<=index; i++) newdata[i]=0; free(bag->data); bag->itemCount=index+1; bag->data=newdata; } bag->data[index]+=increment; } void empty(struct Bag *bag, int index) { if (indexitemCount) bag->data[index]=0; } int frequency(struct Bag *bag, int index) { if (indexitemCount) return bag->data[index]; else return 0; } int bagFrequency(struct Bag *bag) { int i, result; result=0; for (i=0; iitemCount; i++) result+=bag->data[i]; return result; } float bagMean(struct Bag *bag) { int i, frequency; float result, count; count=result=0.0; for (i=0; iitemCount; i++) if (frequency=bag->data[i]) { count+=frequency; result+=i*frequency; } if (count) return result/count; else return 0; } float bagVariance(struct Bag *bag) { int i; float frequency, sum, count, squares; count=sum=squares=0.0; for (i=0; iitemCount; i++) if (frequency=bag->data[i]) { count+=frequency; sum+=i*frequency; squares+=i*i*frequency; } if (count>1) { sum/=count; return (squares-count*sum*sum)/(count-1); } else return 0; } int bagCardinality(struct Bag *bag) { int i, result; result=0; for (i=0; iitemCount; i++) if (bag->data[i]) result++; return result; } int bagMedian(struct Bag *bag) { int i, index; index=bagFrequency(bag)/2; for (i=0; index>0; i++) if (index>bag->data[i]) index-=bag->data[i]; else return i; return i; } void addBag(struct Bag *aBag, struct Bag *bBag) { int i, n; for (i=0; iitemCount; i++) if (n=bBag->data[i]) bump(aBag, i, n); } struct Bag *bagUnion(struct Bag *aBag, struct Bag *bBag) { int i, n; struct Bag *result; result=initBag(); for (i=0; iitemCount; i++) if (n=aBag->data[i]) bump(result, i, n); for (i=0; iitemCount; i++) if (n=bBag->data[i]) bump(result, i, n); return result; } struct Bag *bagIntersection(struct Bag *aBag, struct Bag *bBag) { int i, n; struct Bag *result; result=initBag(); n=MIN(aBag->itemCount, bBag->itemCount); for (i=0; iitemCount; i++) if (bag->data[i]) return i; return -1; } int bagMaximum(struct Bag *bag) { int i; for (i=bag->itemCount-1; i>=0; i--) if (bag->data[i]) return i; return -1; } int bagLoHinge(struct Bag *bag) { int i, index; index=bagFrequency(bag)/4; for (i=0; index>0; i++) if (index>bag->data[i]) index-=bag->data[i]; else return i; return i; } int bagHiHinge(struct Bag *bag) { int i, index; index=bagFrequency(bag)/4; for (i=bag->itemCount; index>0; i--) if (index>bag->data[i]) index-=bag->data[i]; else return i; return i; } int bagMode(struct Bag *bag) { int i, index, max; max=0; for (i=0; iitemCount; i++) if (bag->data[i]>max) { max=bag->data[i]; index=i; } return index; } float bagEntropy(struct Bag *bag) { int i, m; float result, p; if (bagCardinality(bag)<=1) return 0.0; result=0.0; m=bagFrequency(bag); for (i=0; iitemCount; i++) if (p=1.0*bag->data[i]/m) result-=p*log(p); return result; } float continuousRelativeBagEntropy(struct Bag *bag) { float result; if (result=bagEntropy(bag)) result/=log(bagMaximum(bag)-bagMinimum(bag)+1); return result; } float *relativeFrequencies(struct Bag *bag) { int i; float totalFrequency; float *result; result=myalloc(bag->itemCount*sizeof(float)); totalFrequency=bagFrequency(bag); for (i=0; iitemCount; i++) result[i]=bag->data[i]/totalFrequency; return result; } /* Lexica */ struct LexEntry { char *text; int index; struct LexEntry *left, *right; }; struct Lexicon { int activeCount, totalCount; struct LexEntry *head; }; struct Lexicon *initLexicon() { struct Lexicon *lexicon; lexicon=myalloc(sizeof(struct Lexicon)); lexicon->activeCount=0; lexicon->totalCount=0; lexicon->head=NULL; return lexicon; } int findLexEntry(struct LexEntry *entry, char *token) { /* This returns the index of the token if it's in the lexicon, zero if absent. */ int position; if (entry==NULL) return 0; position=strcmp(token, entry->text); if (!position) { if (entry->index>0) return entry->index; else return 0; } else if (position<0) return findLexEntry(entry->left, token); else return findLexEntry(entry->right, token); } int findLexWord(struct Lexicon *lexicon, char *token) { return findLexEntry(lexicon->head, token); } struct LexEntry *initLexEntry(char *token, int n) { struct LexEntry *entry; entry=myalloc(sizeof(struct LexEntry)); entry->text=myalloc(strlen(token)+1); strcpy(entry->text, token); entry->index=n; entry->left=NULL; entry->right=NULL; return entry; } int getLexEntry(struct Lexicon *lexicon, struct LexEntry *entry, char *token) { /* This works like findLexEntry, except if the token is absent from the lexicon, it is added and the newly-generated entry's index is returned. */ int position; if (lexicon->totalCount==0) { lexicon->totalCount=1; lexicon->activeCount=1; lexicon->head=initLexEntry(token, 1); return 1; } position=strcmp(token, entry->text); if (position==0) { if (entry->index<0) { entry->index= -entry->index; lexicon->activeCount++; } return entry->index; } if (position<0) { if (entry->left!=NULL) return getLexEntry(lexicon, entry->left, token); else { lexicon->totalCount++; lexicon->activeCount++; entry->left=initLexEntry(token, lexicon->totalCount); return lexicon->totalCount; } } if (entry->right!=NULL) return getLexEntry(lexicon, entry->right, token); else { lexicon->totalCount++; lexicon->activeCount++; entry->right=initLexEntry(token, lexicon->totalCount); return lexicon->totalCount; } } int getLexWord(struct Lexicon *lexicon, char *token) { return getLexEntry(lexicon, lexicon->head, token); } char *lexItemNode(struct LexEntry *entry, int n) { char *answer; if (entry==NULL) return NULL; if (n==entry->index) return entry->text; else if (-n==entry->index) return NULL; answer=lexItemNode(entry->left, n); if (answer==NULL) answer=lexItemNode(entry->right, n); return answer; } char *lexWord(struct Lexicon *lexicon, int n) { return lexItemNode(lexicon->head, n); } /* Distance functions for main program */ void DMDistance(int taxa, int loci, struct Bag ***s, float **d, int *boot, int featureFlag) { /* Delta mu squared algorithm from Goldstein, D.B., Ruiz-Linares, A., Cavalli-Sforza, L.L., and Feldman, M.W. (1995) Genetic absolute dating based on microsatellites and the origin of modern humans, _Proc. National Academy of Science, USA_ 92:6723-6727. */ int i, j, k, ni, nj, locus; float num, diff, den; if (featureFlag&BYLOCUS) loci=1; for (i=1; i.\n"); break; case '!': if (menu==MAIN) printf("This begins the analysis.\n"); else if (menu==STATISTICS) printf("This returns to the main menu.\n"); else help(MAIN, '-'); break; case '0': if (menu==DISTANCE) printf("This returns to the main menu with no distance selected.\n"); else help(MAIN, '-'); break; default: strcpy(buffer, helptext[option-'A'+menu*('Z'-'A'+1)]); if (strlen(buffer)) printf("%s\n", buffer); else help(MAIN, '-'); break; } else help(MAIN, '-'); gets(buffer); } void getDistance(int *distanceFlag, int *featureFlag) { int n; char buffer[BUFFSIZE]; printf("\nSelect the type of distance measure you want:\n"); printf(" (A) Average square (D1)\n"); printf(" (D) Absolute difference (Dad)\n"); printf(" (F) Fst\n"); printf(" (K) Kinship coefficient (Dkf)\n"); printf(" (M) Delta mu squared (Ddm)\n"); printf(" (N) Nei's standard (Gst)\n"); printf(" (P) Proportion shared alleles (Dps)\n"); printf(" (R) Standardized Rst\n"); printf(" (S) Fuzzy set similarity (Dfs)\n"); printf(" (W) Absolute product (Dsw)\n"); printf(" (0) None\n"); printf(" (?X) Help on option X\n"); gets(buffer); n=toupper(buffer[0]); switch (n) { case '0': *distanceFlag=NO; break; case 'A': *distanceFlag=AS; break; case 'M': *distanceFlag=DM; break; case 'D': *distanceFlag=AD; break; case 'W': *distanceFlag=SW; break; case 'N': *distanceFlag=NI; break; case 'F': *distanceFlag=RF; break; case 'R': *distanceFlag=SR; break; case 'P': *distanceFlag=PS; break; case 'K': *distanceFlag=KF; break; case 'S': *distanceFlag=FS; break; case '?': help(DISTANCE, toupper(buffer[1])); getDistance(distanceFlag, featureFlag); break; default: help(MAIN, '-'); getDistance(distanceFlag, featureFlag); } if (*distanceFlag==PS||*distanceFlag==KF|| *distanceFlag==FS||*distanceFlag==NI) { do { switch (*distanceFlag) { case PS: sprintf(buffer, "ps"); break; case KF: sprintf(buffer, "kf"); break; case FS: sprintf(buffer, "fs"); break; case NI: sprintf(buffer, "ni"); break; } printf("Transform by (L) -ln(%s) or by (M) 1-(%s)? ", buffer, buffer); gets(buffer); n=toupper(buffer[0]); } while (n!='L'&&n!='M'); if (n=='L') *featureFlag|=LOGARITHM; else *featureFlag&=(LOGARITHM^COMPLEMENT); } if (*distanceFlag==KF) { printf("Adjust distance by subtracting pairwise average within? "); gets(buffer); if (toupper(buffer[0])=='Y') *featureFlag|=ADJUST; else *featureFlag&=(ADJUST^COMPLEMENT); } else if (*distanceFlag==DM) { printf("Adjust distance for small sample size? "); gets(buffer); if (toupper(buffer[0])=='Y') *featureFlag|=ADJUST; else *featureFlag&=(ADJUST^COMPLEMENT); } } void getStatistics(int *statisticsFlag, int *alleleFlag, int *outlierCriterion, float *mutationRate, int *primerError) { int n; char buffer[BUFFSIZE]; printf("\nSelect the statistics you want:\n"); printf(" %-30s [%s]\n", "(M)issing data", *statisticsFlag&MISSING ? "Yes" : "No"); printf(" %-30s [%s]\n", "(A)nomalous frequencies", *statisticsFlag&ANOMALY ? "Yes" : "No"); printf(" %-30s [%s]\n", "(I)ncommensurable taxa", *statisticsFlag&COMPARE ? "Yes" : "No"); printf(" %-30s [%s]\n", "(T)axon-specific alleles", *statisticsFlag&SPECIFIC ? "Yes" : "No"); printf(" %-30s [%s]\n", "(P)airwise-specific alleles", *statisticsFlag&PAIRWISE ? "Yes" : "No"); printf(" %-30s [%s]\n", "(O)utliers", *statisticsFlag&OUTLIER ? "Yes" : "No"); if (*statisticsFlag&OUTLIER) printf(" %-30s [%d %s]\n", "(C)riterion for outliers", *outlierCriterion, *statisticsFlag&HINGE ? "hinges" : "sigmas"); printf(" %-30s [%s]\n", "(X)tabulation", *statisticsFlag&CROSSTAB ? "Yes" : "No"); printf(" %-30s [%s]\n", "(L)inearity", *statisticsFlag&LINEARITY ? "Yes" : "No"); if (*statisticsFlag&LINEARITY) { printf(" %-30s [%f]\n", "(R)ate of mutation", *mutationRate); if (!*alleleFlag) printf(" %-30s [%d]\n", "(E)rror from primer", *primerError); } printf(" %-30s [%s]\n", "(F)requency distribution", *statisticsFlag&FREQUENCY ? "Yes" : "No"); printf(" %-30s [%s]\n", "(V)ariability", *statisticsFlag&DIVERSITY ? "Yes" : "No"); printf(" (?X) Help on option X\n"); printf(" %-30s\n", " ! Back to main menu"); gets(buffer); n=toupper(buffer[0]); switch (n) { case '?': help(STATISTICS, toupper(buffer[1])); break; case '!': return; case 'A': *statisticsFlag^=ANOMALY; break; case 'C': if (*statisticsFlag&HINGE) printf("Enter number of interhinge ranges (+/-): "); else printf("Enter number of standard deviations (+/-): "); gets(buffer); *outlierCriterion=atoi(buffer); break; case 'E': if (!*alleleFlag) { printf("Enter length of primer error: "); gets(buffer); *primerError=atoi(buffer); } break; case 'F': *statisticsFlag^=FREQUENCY; if (*statisticsFlag&FREQUENCY) { do { printf("%s by (L)ocus or by (T)axon? ", *statisticsFlag&DIVERSITY ? "Frequency and variability" : "Frequency"); gets(buffer); n=toupper(buffer[0]); } while (n!='L'&&n!='T'); if (n=='T') *statisticsFlag|=BYTAXON; else *statisticsFlag&=(BYTAXON^COMPLEMENT); } break; case 'I': *statisticsFlag^=COMPARE; break; case 'L': *statisticsFlag^=LINEARITY; break; case 'M': *statisticsFlag^=MISSING; break; case 'O': n=0; do { printf("(H)inge checking, (S)igma checking, or (N)one? "); gets(buffer); n=toupper(buffer[0]); } while (n!='H'&&n!='S'&&n!='N'); switch (n) { case 'H': *statisticsFlag|=OUTLIER; *statisticsFlag|=HINGE; *outlierCriterion=DEFAULTHINGE; break; case 'S': *statisticsFlag|=OUTLIER; *statisticsFlag&=(HINGE^COMPLEMENT); *outlierCriterion=DEFAULTSIGMA; break; case 'N': *statisticsFlag&=(OUTLIER^COMPLEMENT); break; } break; case 'P': *statisticsFlag^=PAIRWISE; break; case 'R': printf("Enter mutation rate: "); gets(buffer); *mutationRate=atof(buffer); break; case 'T': *statisticsFlag^=SPECIFIC; break; case 'V': *statisticsFlag^=DIVERSITY; if (*statisticsFlag&DIVERSITY) { do { printf("%s by (L)ocus or by (T)axon? ", *statisticsFlag&FREQUENCY ? "Frequency and variability" : "Variability"); gets(buffer); n=toupper(buffer[0]); } while (n!='L'&&n!='T'); if (n=='T') *statisticsFlag|=BYTAXON; else *statisticsFlag&=(BYTAXON^COMPLEMENT); } break; case 'X': *statisticsFlag^=CROSSTAB; break; } getStatistics(statisticsFlag, alleleFlag, outlierCriterion, mutationRate, primerError); } void getOptions(int *distanceFlag, int *statisticsFlag, int *featureFlag, int *outlierCriterion, int *alleleFlag, int *repeatLength, int *primerError, float *mutationRate, int *bootstraps, int *printFlag, char *infileName, char *outfileName) { int n; char buffer[BUFFSIZE]; printf("\nSelect the option you wish to change:\n"); /* Display data transformation options */ printf(" %-30s [%s]\n", "(A)llele sizes", *alleleFlag ? "# Repeats" : "# Nucleotides"); if (!*alleleFlag) printf(" %-30s [%d]\n", "(R)epeat length", *repeatLength); /* Display distance option */ printf(" %-30s", "(D)istances"); switch (*distanceFlag) { case AD: printf(" [Dad: Absolute difference]\n"); break; case AS: printf(" [D1: Average square]\n"); break; case DM: if (*featureFlag&ADJUST) printf(" [Ddm*: Adjusted delta mu squared]\n"); else printf(" [Ddm: Delta mu squared]\n"); break; case FS: if (*featureFlag&LOGARITHM) printf(" [Dfs: -ln(Fuzzy similarity)]\n"); else printf(" [Dfs': 1-(Fuzzy similarity)]\n"); break; case KF: if (*featureFlag&LOGARITHM) if (*featureFlag&ADJUST) printf(" [Dkf*: Adjusted -ln(Kinship coefficient)]\n"); else printf(" [Dkf: -ln(Kinship coefficient)]\n"); else if (*featureFlag&ADJUST) printf(" [Dkf*': Adjusted 1-(Kinship coefficient)]\n"); else printf(" [Dkf': 1-(Kinship coefficient)]\n"); break; case NI: if (*featureFlag&LOGARITHM) printf(" [Gst: -ln(Nei's identity)]\n"); else printf(" [Gst': 1-(Nei's identity)]\n"); break; case PS: if (*featureFlag&LOGARITHM) printf(" [Dps: -ln(Proportion shared alleles)]\n"); else printf(" [Dps': 1-(Proportion shared alleles)]\n"); break; case RF: printf(" [Fst]\n"); break; case SW: printf(" [Dsw: Absolute product]\n"); break; case SR: printf(" [Standardized Rst]\n"); break; default: *distanceFlag=NO; printf(" [No]\n"); break; } if (*distanceFlag&&(*bootstraps<=1)) printf(" %-30s [%s]\n", "(L)ocus-specific distances", *featureFlag&BYLOCUS ? "Yes" : "No"); /* Display statistics */ printf(" %-30s [%s]\n", "(S)tatistics", *statisticsFlag ? "Yes" : "No"); if (*statisticsFlag) { if (*statisticsFlag&MISSING) printf(" %-30s\n", " Missing data"); if (*statisticsFlag&ANOMALY) printf(" %-30s\n", " Anomalous frequencies"); if (*statisticsFlag&COMPARE) printf(" %-30s\n", " Incommensurable taxa"); if (*statisticsFlag&SPECIFIC) printf(" %-30s\n", " Taxon-specific alleles"); if (*statisticsFlag&PAIRWISE) printf(" %-30s\n", " Pairwise-specific alleles"); if (*statisticsFlag&CROSSTAB) printf(" %-30s\n", " Crosstabulation"); if (*statisticsFlag&OUTLIER) printf(" %-30s [%d %s]\n", " Outliers", *outlierCriterion, *statisticsFlag&HINGE ? "hinges" : "sigmas"); if (*statisticsFlag&LINEARITY) printf(" %-30s\n", " Linearity"); if (*statisticsFlag&FREQUENCY) printf(" %-30s [by %s]\n", " Frequency distribution", *statisticsFlag&BYTAXON ? "taxon" : "locus"); if (*statisticsFlag&DIVERSITY) printf(" %-30s [by %s]\n", " Variability", *statisticsFlag&BYTAXON ? "taxon" : "locus"); } /* Display bootstrap options here */ if (*bootstraps>1) if (*distanceFlag) printf(" %-30s [%s %d]\n", "(B)ootstraps", *printFlag==ALL ? "All of" : "Mean of", *bootstraps); else printf(" %-30s [%d]\n", "(B)ootstraps", *bootstraps); else if ((*distanceFlag&&!(*featureFlag&BYLOCUS))||*statisticsFlag&DIVERSITY) printf(" %-30s [0]\n", "(B)ootstraps"); /* Display I/O options */ printf(" %-30s [%s]\n", "(I)nput data from file name", infileName); printf(" %-30s [%s]\n", "(O)utput results to file name", outfileName); printf(" (?X) Help on option X\n"); printf(" %-30s\n", " ! Ready"); /* Pick up the choice */ gets(buffer); n=toupper(buffer[0]); switch (n) { case '?': help(MAIN, toupper(buffer[1])); break; case '!': if ((*distanceFlag)&&(*bootstraps<=1)) *printFlag=MEAN; if (*alleleFlag) { *repeatLength=1; *primerError=0; } return; case 'A': *alleleFlag=1- *alleleFlag; break; case 'B': printf("Enter number of bootstraps: "); gets(buffer); *bootstraps=atoi(buffer); *printFlag=ERROR; if (*distanceFlag&&*bootstraps) do { printf("Show (A)ll or (M)ean of bootstrapped distances? "); gets(buffer); switch (toupper(buffer[0])) { case 'A': *printFlag=ALL; break; case 'M': *printFlag=MEAN; break; default: *printFlag=ERROR; } } while (!*printFlag); break; case 'D': getDistance(distanceFlag, featureFlag); break; case 'I': printf("Enter name of input file: "); gets(buffer); strcpy(infileName, buffer); break; case 'L': *featureFlag^=BYLOCUS; break; case 'O': printf("Enter name of output file: "); gets(buffer); strcpy(outfileName, buffer); break; case 'R': if (!*alleleFlag) { printf("Enter length of repeat: "); gets(buffer); *repeatLength=atoi(buffer); } break; case 'S': getStatistics(statisticsFlag, alleleFlag, outlierCriterion, mutationRate, primerError); break; default: help(MAIN, '-'); break; } getOptions(distanceFlag, statisticsFlag, featureFlag, outlierCriterion, alleleFlag, repeatLength, primerError, mutationRate, bootstraps, printFlag, infileName, outfileName); } void main(int argc, char *argv[]) { FILE *infile, *outfile; /* Strings */ char buffer[BUFFSIZE]; char *token, *infileName, *outfileName; /* Flags */ int distanceFlag, statisticsFlag, alleleFlag, printFlag, frequencyFlag, generateFlag, featureFlag; /* Scalars */ int i, j, k, ii, jj, bootstrap; int locus, loci, taxon, taxa; int character, freq; int repeatLength, primerError, bootstraps; int min, max, med, mod, repeat, poly; int loHinge, hiHinge, outlierCriterion; float diff1, tau1tot, tau1avg, tau1, diff2, tau2tot, tau2avg, tau2; float Mr, Mravg, Mrtot, Mf, Mfavg, Mftot; float mutationRate, minCrit, maxCrit, meanSize, sdSize; float varFstVar, varFstHet, varHet , varHetTot, varVar, varVarTot; float varAll, varAllTot, varRan, varRanTot, varMax, varMaxTot; float totFstVar, totFstHet, totHet, totHetTot, totVar, totVarTot; float totAll, totAllTot, totRan, totRanTot, totMax, totMaxTot; float varEnt, varEntTot, totEnt, totEntTot, varRstStd, totRstStd; float avg, totAvg, totMed, totMod, totMin; /* Vectors */ int *bootLocus; float *variance; /* Matrices */ float **distance, **meanDistance, **sdDistance; /* Structures */ struct Lexicon *taxonNames, *locusNames, *iNames; struct Bag ***data, ***rawData; struct Bag **locusData, **taxonData, **iData; struct Bag *total; /* Initialize */ infile=stdin; outfile=stdout; mutationRate=DEFAULTMUTATION; repeatLength=DEFAULTREPEAT; primerError=DEFAULTPRIMER; outlierCriterion=ERROR; distanceFlag=statisticsFlag=featureFlag=alleleFlag=printFlag=0; bootstraps=1; infileName=myalloc(BUFFSIZE); outfileName=myalloc(BUFFSIZE); strcpy(infileName, "stdin"); strcpy(outfileName, "stdout"); do { getOptions(&distanceFlag, &statisticsFlag, &featureFlag, &outlierCriterion, &alleleFlag, &repeatLength, &primerError, &mutationRate, &bootstraps, &printFlag, infileName, outfileName); if (strcmp(infileName, "stdin")) { infile=fopen(infileName, "r"); if (infile==NULL) printf("Can't find data file named %s\n", infileName); } if (strcmp(outfileName, "stdout")) { outfile=fopen(outfileName, "w"); if (outfile==NULL) printf("Can't create output file named %s\n", outfileName); } } while (infile==NULL||outfile==NULL); /* Allocate some structs */ taxonNames=initLexicon(); locusNames=initLexicon(); /* Count the taxa and loci, and build the lexica */ taxa=loci=0; while (fgets(buffer, BUFFSIZE, infile)) if ((buffer[0]!='%')&&fieldCheck(buffer)) { token=strtok(buffer, " \t\n"); taxon=getLexWord(taxonNames, token); taxa=MAX(taxon, taxa); token=strtok(NULL, " \t\n"); if (!(locus=findLexWord(locusNames, token))) { locus=getLexWord(locusNames, token); loci=MAX(locus, loci); } } /* Initialize the arrays */ distance=myalloc(taxa*sizeof(float*)); meanDistance=myalloc(taxa*sizeof(float*)); sdDistance=myalloc(taxa*sizeof(float*)); for (i=0; i1) { bootstrap=MIN(BIGINT, binomial(2*loci-1, loci)); generateFlag=0; /* Query if requested bootstraps are not likely to sample completely */ k=bootstrap*pow(1.0-1.0/bootstrap, bootstraps); if (bootstrap0) { printf("There are only %d possible bootstrap samplings for %d loci.\n", bootstrap, loci); if (k) printf("%d bootstraps will probably include only ~%d of them.\n", bootstraps, bootstrap-k); printf("Do you want them generated exhaustively instead? "); gets(buffer); if (toupper(buffer[0])=='Y') { generateFlag=1; bootstraps=bootstrap; } } } /* Check for conforming repeat lengths */ for (locus=0; locus1)&& (repeat=minimumIncrement(locusData[locus], min, max))!=repeatLength) { printf("Warning: locus %s appears to have a repeat length of %d\n", lexWord(locusNames, locus+1), repeat); showIncrement(locusData[locus], min, max, repeat); printf("Treat it as repeat length %d? ", repeat); gets(buffer); if (toupper(buffer[0])!='Y') repeat=repeatLength; } /* Check for short alleles */ if ((min-primerError)<0) { printf("Error: locus %s has alleles shorter than the primer error of %d.\nPrimer error will be decreased to %d.\n", lexWord(locusNames, locus+1), primerError, min); primerError=min; } /* Convert data to repeat score form */ for (taxon=0; taxonminCrit&&j0)^ (frequency(data[j][locus], k)>0)); fprintf(outfile, "%10.10s%10d%10.3f\n", lexWord(locusNames, locus+1), freq, 50.0*freq/((taxa*taxa-taxa)*bagCardinality(locusData[locus]))); } } /* Want distances? */ if (distanceFlag) { /* If distances are wanted by locus, hack some parameters */ if (featureFlag&BYLOCUS) { bootstraps=loci; printFlag=ALL; } /* Rst is a special case: we have to pass the variance for each locus */ if (distanceFlag==SR) { variance=myalloc(loci*sizeof(float)); for (locus=0; locus1) if (generateFlag) if (bootstrap) { i=0; do { bootLocus[i]=(bootLocus[i]+1)%loci; } while (!bootLocus[i++]); locus=bootLocus[i-1]; do { bootLocus[--i]=locus; } while (i); } else for (locus=0; locus1) sdDistance[i][j]=sqrt((sdDistance[i][j]- bootstraps*meanDistance[i][j]*meanDistance[i][j])/(bootstraps-1)); } /* Print distance matrix in PHYLIP format */ if (printFlag==MEAN) { fprintf(outfile, "%d\n", taxa); for (i=0; i1) { fprintf(outfile, "\nStandard errors of distances:\n"); for (i=0; i1) { ii++; diff1=bagMaximum(locusData[locus])-bagMinimum(locusData[locus]); diff2=bagMaximum(locusData[locus]); tau1avg+=diff1; tau2avg+=diff2; tau1=TAU(diff1); tau2=TAU(diff2); tau1tot+=tau1; tau2tot+=tau2; jj=0; for (taxon=0; taxonmax) max=min; /* Print the taxon names vertically */ for (i=0; i1) { poly++; varHet=varVar=varAll=varRan=varMax=varEnt=k=0; for (j=0; j